From 5b5271a33aa01391dfdef48c0453b990c866e088 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 24 Aug 2023 17:34:41 +0800 Subject: [PATCH 01/89] minor changes to benchmarking scripts --- scripts/local_bench.tmp.py | 2 +- scripts/local_client.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 714eac67..6f1ceb78 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -110,7 +110,7 @@ def bench_round(protocol, num_replicas, value_size, put_ratio, length_s): if __name__ == "__main__": do_cargo_build() - for num_replicas in (3, 7): + for num_replicas in (3, 5, 7): for value_size in (1024, 65536, 4194304): for protocol in ("MultiPaxos", "RSPaxos"): bench_round(protocol, num_replicas, value_size, 100, 60) diff --git a/scripts/local_client.py b/scripts/local_client.py index c0f46adf..04347398 100644 --- a/scripts/local_client.py +++ b/scripts/local_client.py @@ -71,7 +71,7 @@ def compose_client_cmd(protocol, manager, config, utility, params, release): # if in benchmarking mode, lower the client's CPU scheduling priority if utility == "bench": - cmd = ["nice", "-n", "15"] + cmd + cmd = ["nice", "-n", "19"] + cmd return cmd From 91d8778aae6902509ea1b3b434c07831308d44da Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 24 Aug 2023 20:06:10 +0800 Subject: [PATCH 02/89] minor updates to README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 3e2fe00a..3ccadae9 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,7 @@ Summerset is a distributed key-value store supporting a wide range of state mach | `RepNothing` | Simplest protocol w/o any replication | | `SimplePush` | Pushing to peers w/o any consistency guarantees | | `MultiPaxos` | Classic [MultiPaxos](https://www.microsoft.com/en-us/research/uploads/prod/2016/12/paxos-simple-Copy.pdf) protocol | +| `RS-Paxos` | MultiPaxos w/ Reed-Solomon erasure code sharding | Formal TLA+ specification of some protocols are provided in `tla+/`. 
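Note on the RS-Paxos row added above, whose idea is shared by the Crossword protocol introduced in the next patch: each request batch is Reed-Solomon encoded into a number of data shards equal to the majority quorum size plus parity shards for the remaining replicas, so any quorum-sized subset of shards can recover the batch. Below is a minimal standalone sketch of that encode/reconstruct cycle using the `reed_solomon_erasure` crate these protocols build on; it is not Summerset code, and the 5-replica / 3-quorum setup and toy 12-byte value are illustrative assumptions only.

    use reed_solomon_erasure::galois_8::ReedSolomon;

    fn main() {
        // population = 5 replicas => majority quorum = 3,
        // hence 3 data shards + 2 parity shards
        let rs = ReedSolomon::new(3, 2).unwrap();

        // a 12-byte "value" split into 3 equal data shards,
        // followed by 2 zeroed parity shards to be filled in
        let mut shards: Vec<Vec<u8>> = vec![
            b"aaaa".to_vec(),
            b"bbbb".to_vec(),
            b"cccc".to_vec(),
            vec![0u8; 4],
            vec![0u8; 4],
        ];
        rs.encode(&mut shards).unwrap(); // computes the 2 parity shards

        // lose any 2 shards (e.g., two replicas are unreachable)...
        let mut partial: Vec<Option<Vec<u8>>> =
            shards.into_iter().map(Some).collect();
        partial[1] = None;
        partial[4] = None;

        // ...any 3 surviving shards still reconstruct the original data
        rs.reconstruct(&mut partial).unwrap();
        assert_eq!(partial[1].as_deref(), Some(&b"bbbb"[..]));
    }
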
From 7979ddac1663a9bccc03a598f5d67bd4fb6580cd Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Mon, 28 Aug 2023 16:54:27 +0800 Subject: [PATCH 03/89] add very basic crossword impl --- scripts/local_client.py | 1 + scripts/local_cluster.py | 1 + src/lib.rs | 2 + src/protocols/crossword.rs | 1410 ++++++++++++++++++++++++++++++++++++ src/protocols/mod.rs | 18 + src/protocols/rs_paxos.rs | 2 +- 6 files changed, 1433 insertions(+), 1 deletion(-) create mode 100644 src/protocols/crossword.rs diff --git a/scripts/local_client.py b/scripts/local_client.py index 04347398..2f9c2c6d 100644 --- a/scripts/local_client.py +++ b/scripts/local_client.py @@ -26,6 +26,7 @@ def run_process(cmd): "SimplePush": "", "MultiPaxos": "", "RSPaxos": "", + "Crossword": "", } diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index a1d33351..b7dcdb25 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -40,6 +40,7 @@ def kill_all_matching(name): "SimplePush": lambda r, n: f"backer_path='/tmp/summerset.simple_push.{r}.wal'+rep_degree={n-1}", "MultiPaxos": lambda r, n: f"backer_path='/tmp/summerset.multipaxos.{r}.wal'", "RSPaxos": lambda r, n: f"backer_path='/tmp/summerset.rs_paxos.{r}.wal'+fault_tolerance={n-(n//2+1)}", + "Crossword": lambda r, n: f"backer_path='/tmp/summerset.rs_paxos.{r}.wal'+fault_tolerance=0+shards_per_replica=3", } diff --git a/src/lib.rs b/src/lib.rs index f5cf4126..40bcbf31 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -35,3 +35,5 @@ pub use crate::protocols::SmrProtocol; pub use crate::protocols::{ReplicaConfigRepNothing, ClientConfigRepNothing}; pub use crate::protocols::{ReplicaConfigSimplePush, ClientConfigSimplePush}; pub use crate::protocols::{ReplicaConfigMultiPaxos, ClientConfigMultiPaxos}; +pub use crate::protocols::{ReplicaConfigRSPaxos, ClientConfigRSPaxos}; +pub use crate::protocols::{ReplicaConfigCrossword, ClientConfigCrossword}; diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs new file mode 100644 index 00000000..dfe647ea --- /dev/null +++ b/src/protocols/crossword.rs @@ -0,0 +1,1410 @@ +//! Replication protocol: Crossword. +//! +//! MultiPaxos with flexible Reed-Solomon erasure coding that supports tunable +//! shard groups and asymmetric shard assignment. + +use std::collections::{HashMap, HashSet}; +use std::path::Path; +use std::net::SocketAddr; + +use crate::utils::{SummersetError, ReplicaMap, RSCodeword}; +use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; +use crate::server::{ + ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, + ApiRequest, ApiReply, StorageHub, LogAction, LogResult, LogActionId, + TransportHub, GenericReplica, +}; +use crate::client::{ClientId, ClientApiStub, ClientCtrlStub, GenericEndpoint}; +use crate::protocols::SmrProtocol; + +use async_trait::async_trait; + +use serde::{Serialize, Deserialize}; + +use tokio::time::Duration; + +use reed_solomon_erasure::galois_8::ReedSolomon; + +/// Configuration parameters struct. +#[derive(Debug, Deserialize)] +pub struct ReplicaConfigCrossword { + /// Client request batching interval in microsecs. + pub batch_interval_us: u64, + + /// Client request batching maximum batch size. + pub max_batch_size: usize, + + /// Path to backing file. + pub backer_path: String, + + /// Whether to call `fsync()`/`fdatasync()` on logger. + pub logger_sync: bool, + + /// Fault-tolerance level. + pub fault_tolerance: u8, + + /// Number of shards to assign to each replica. + // TODO: proper config options. 
+ pub shards_per_replica: u8, +} + +#[allow(clippy::derivable_impls)] +impl Default for ReplicaConfigCrossword { + fn default() -> Self { + ReplicaConfigCrossword { + batch_interval_us: 1000, + max_batch_size: 5000, + backer_path: "/tmp/summerset.rs_paxos.wal".into(), + logger_sync: false, + fault_tolerance: 0, + shards_per_replica: 1, + } + } +} + +/// Ballot number type. Use 0 as a null ballot number. +type Ballot = u64; + +/// Instance status enum. +#[derive( + Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Serialize, Deserialize, +)] +enum Status { + Null = 0, + Preparing = 1, + Accepting = 2, + Committed = 3, + Executed = 4, +} + +/// Request batch type (i.e., the "value" in Paxos). +type ReqBatch = Vec<(ClientId, ApiRequest)>; + +/// Leader-side bookkeeping info for each instance initiated. +#[derive(Debug, Clone)] +struct LeaderBookkeeping { + /// Replicas from which I have received Prepare confirmations. + prepare_acks: ReplicaMap, + + /// Max ballot among received Prepare replies. + prepare_max_bal: Ballot, + + /// Replicas from which I have received Accept confirmations. + accept_acks: ReplicaMap, +} + +/// Follower-side bookkeeping info for each instance received. +#[derive(Debug, Clone)] +struct ReplicaBookkeeping { + /// Source leader replica ID for replyiing to Prepares and Accepts. + source: ReplicaId, +} + +/// In-memory instance containing a complete commands batch. +#[derive(Debug, Clone)] +struct Instance { + /// Ballot number. + bal: Ballot, + + /// Instance status. + status: Status, + + /// Shards of a batch of client requests. + reqs_cw: RSCodeword, + + /// Leader-side bookkeeping info. + leader_bk: Option, + + /// Follower-side bookkeeping info. + replica_bk: Option, +} + +/// Stable storage log entry type. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +enum LogEntry { + /// Records an update to the largest prepare ballot seen. + PrepareBal { slot: usize, ballot: Ballot }, + + /// Records a newly accepted request batch data shards at slot index. + AcceptData { + slot: usize, + ballot: Ballot, + reqs_cw: RSCodeword, + }, + + /// Records an event of committing the instance at index. + CommitSlot { slot: usize }, +} + +/// Peer-peer message type. +#[derive(Debug, Clone, Serialize, Deserialize)] +enum PeerMsg { + /// Prepare message from leader to replicas. + Prepare { slot: usize, ballot: Ballot }, + + /// Prepare reply from replica to leader. + PrepareReply { + slot: usize, + ballot: Ballot, + /// The accepted ballot number for that instance and the corresponding + /// request batch value shards known by replica. + voted: Option<(Ballot, RSCodeword)>, + }, + + /// Accept message from leader to replicas. + Accept { + slot: usize, + ballot: Ballot, + reqs_cw: RSCodeword, + }, + + /// Accept reply from replica to leader. + AcceptReply { slot: usize, ballot: Ballot }, + + /// Commit notification from leader to replicas. + Commit { slot: usize }, +} + +/// Crossword server replica module. +pub struct CrosswordReplica { + /// Replica ID in cluster. + id: ReplicaId, + + /// Total number of replicas in cluster. + population: u8, + + /// Majority quorum size. + quorum_cnt: u8, + + /// Configuration parameters struct. + config: ReplicaConfigCrossword, + + /// Address string for client requests API. + _api_addr: SocketAddr, + + /// Address string for internal peer-peer communication. + _p2p_addr: SocketAddr, + + /// ControlHub module. + control_hub: ControlHub, + + /// ExternalApi module. + external_api: ExternalApi, + + /// StateMachine module. 
+ state_machine: StateMachine, + + /// StorageHub module. + storage_hub: StorageHub, + + /// TransportHub module. + transport_hub: TransportHub, + + /// Do I think I am the leader? + is_leader: bool, + + /// In-memory log of instances. + insts: Vec, + + /// Largest ballot number that a leader has sent Prepare messages in. + bal_prep_sent: Ballot, + + /// Largest ballot number that a leader knows has been safely prepared. + bal_prepared: Ballot, + + /// Largest ballot number seen as acceptor. + bal_max_seen: Ballot, + + /// Index of the first non-committed instance. + commit_bar: usize, + + /// Index of the first non-executed instance. + /// It is always true that exec_bar <= commit_bar <= insts.len() + exec_bar: usize, + + /// Current durable log file offset. + log_offset: usize, + + /// Fixed Reed-Solomon coder. + rs_coder: ReedSolomon, +} + +impl CrosswordReplica { + /// Compose a unique ballot number from base. + fn make_unique_ballot(&self, base: u64) -> Ballot { + ((base << 8) | ((self.id + 1) as u64)) as Ballot + } + + /// Compose a unique ballot number greater than the given one. + fn make_greater_ballot(&self, bal: Ballot) -> Ballot { + self.make_unique_ballot((bal >> 8) + 1) + } + + /// Compose LogActionId from slot index & entry type. + /// Uses the `Status` enum type to represent differnet entry types. + fn make_log_action_id(slot: usize, entry_type: Status) -> LogActionId { + let type_num = match entry_type { + Status::Preparing => 1, + Status::Accepting => 2, + Status::Committed => 3, + _ => panic!("unknown log entry type {:?}", entry_type), + }; + ((slot << 2) | type_num) as LogActionId + } + + /// Decompose LogActionId into slot index & entry type. + fn split_log_action_id(log_action_id: LogActionId) -> (usize, Status) { + let slot = (log_action_id >> 2) as usize; + let type_num = log_action_id & ((1 << 2) - 1); + let entry_type = match type_num { + 1 => Status::Preparing, + 2 => Status::Accepting, + 3 => Status::Committed, + _ => panic!("unknown log entry type num {}", type_num), + }; + (slot, entry_type) + } + + /// Compose CommandId from slot index & command index within. + fn make_command_id(slot: usize, cmd_idx: usize) -> CommandId { + assert!(slot <= (u32::MAX as usize)); + assert!(cmd_idx <= (u32::MAX as usize)); + ((slot << 32) | cmd_idx) as CommandId + } + + /// Decompose CommandId into slot index & command index within. + fn split_command_id(command_id: CommandId) -> (usize, usize) { + let slot = (command_id >> 32) as usize; + let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; + (slot, cmd_idx) + } + + /// TODO: maybe remove this. + fn shards_for_replica( + id: ReplicaId, + population: u8, + num_shards: u8, + ) -> HashSet { + (id..(id + num_shards)) + .map(|i| (i % population) as usize) + .collect() + } + + /// Handler of client request batch chan recv. + fn handle_req_batch( + &mut self, + req_batch: ReqBatch, + ) -> Result<(), SummersetError> { + let batch_size = req_batch.len(); + assert!(batch_size > 0); + pf_debug!(self.id; "got request batch of size {}", batch_size); + + // if I'm not a leader, ignore client requests + if !self.is_leader { + for (client, req) in req_batch { + if let ApiRequest::Req { id: req_id, .. 
} = req { + // tell the client to try on the next replica + let next_replica = (self.id + 1) % self.population; + self.external_api.send_reply( + ApiReply::Reply { + id: req_id, + result: None, + redirect: Some(next_replica), + }, + client, + )?; + pf_trace!(self.id; "redirected client {} to replica {}", + client, next_replica); + } + } + return Ok(()); + } + + // compute the complete Reed-Solomon codeword for the batch data + let mut reqs_cw = RSCodeword::from_data( + req_batch, + self.quorum_cnt as usize, + (self.population - self.quorum_cnt) as usize, + )?; + reqs_cw.compute_parity(Some(&self.rs_coder))?; + + // create a new instance in the first null slot (or append a new one + // at the end if no holes exist) + // TODO: maybe use a null_idx variable to better keep track of this + let mut slot = self.insts.len(); + for s in self.commit_bar..self.insts.len() { + if self.insts[s].status == Status::Null { + slot = s; + break; + } + } + if slot < self.insts.len() { + let old_inst = &mut self.insts[slot]; + assert_eq!(old_inst.status, Status::Null); + old_inst.reqs_cw = reqs_cw; + old_inst.leader_bk = Some(LeaderBookkeeping { + prepare_acks: ReplicaMap::new(self.population, false), + prepare_max_bal: 0, + accept_acks: ReplicaMap::new(self.population, false), + }); + } else { + let new_inst = Instance { + bal: 0, + status: Status::Null, + reqs_cw, + leader_bk: Some(LeaderBookkeeping { + prepare_acks: ReplicaMap::new(self.population, false), + prepare_max_bal: 0, + accept_acks: ReplicaMap::new(self.population, false), + }), + replica_bk: None, + }; + self.insts.push(new_inst); + } + + // decide whether we can enter fast path for this instance + // TODO: remember to reset bal_prepared to 0, update bal_max_seen, + // and re-handle all Preparing & Accepting instances in autonomous + // Prepare initiation + if self.bal_prepared == 0 { + // slow case: Prepare phase not done yet. 
Initiate a Prepare round + // if none is on the fly, or just wait for some Prepare reply to + // trigger my Accept phase + if self.bal_prep_sent == 0 { + self.bal_prep_sent = + self.make_greater_ballot(self.bal_max_seen); + self.bal_max_seen = self.bal_prep_sent; + } + + let inst = &mut self.insts[slot]; + inst.bal = self.bal_prep_sent; + inst.status = Status::Preparing; + pf_debug!(self.id; "enter Prepare phase for slot {} bal {}", + slot, inst.bal); + + // record update to largest prepare ballot + self.storage_hub.submit_action( + Self::make_log_action_id(slot, Status::Preparing), + LogAction::Append { + entry: LogEntry::PrepareBal { + slot, + ballot: self.bal_prep_sent, + }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted PrepareBal log action for slot {} bal {}", + slot, inst.bal); + + // send Prepare messages to all peers + self.transport_hub.bcast_msg( + PeerMsg::Prepare { + slot, + ballot: self.bal_prep_sent, + }, + None, + )?; + pf_trace!(self.id; "broadcast Prepare messages for slot {} bal {}", + slot, inst.bal); + } else { + // normal case: Prepare phase covered, only do the Accept phase + let inst = &mut self.insts[slot]; + inst.bal = self.bal_prepared; + inst.status = Status::Accepting; + pf_debug!(self.id; "enter Accept phase for slot {} bal {}", + slot, inst.bal); + + // record update to largest accepted ballot and corresponding data + self.storage_hub.submit_action( + Self::make_log_action_id(slot, Status::Accepting), + LogAction::Append { + entry: LogEntry::AcceptData { + slot, + ballot: inst.bal, + // persist only some shards on myself + reqs_cw: inst.reqs_cw.subset_copy( + Self::shards_for_replica( + self.id, + self.population, + self.config.shards_per_replica, + ), + false, + )?, + }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted AcceptData log action for slot {} bal {}", + slot, inst.bal); + + // send Accept messages to all peers, each getting its subset of + // shards of data + for peer in 0..self.population { + if peer == self.id { + continue; + } + self.transport_hub.send_msg( + PeerMsg::Accept { + slot, + ballot: inst.bal, + reqs_cw: inst.reqs_cw.subset_copy( + Self::shards_for_replica( + peer, + self.population, + self.config.shards_per_replica, + ), + false, + )?, + }, + peer, + )?; + } + pf_trace!(self.id; "broadcast Accept messages for slot {} bal {}", + slot, inst.bal); + } + + Ok(()) + } + + /// Handler of PrepareBal logging result chan recv. 
+ fn handle_logged_prepare_bal( + &mut self, + slot: usize, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "finished PrepareBal logging for slot {} bal {}", + slot, self.insts[slot].bal); + let inst = &self.insts[slot]; + let voted = if inst.status >= Status::Accepting { + Some((inst.bal, inst.reqs_cw.clone())) + } else { + None + }; + + if self.is_leader { + // on leader, finishing the logging of a PrepareBal entry + // is equivalent to receiving a Prepare reply from myself + // (as an acceptor role) + self.handle_msg_prepare_reply(self.id, slot, inst.bal, voted)?; + } else { + // on follower replica, finishing the logging of a + // PrepareBal entry leads to sending back a Prepare reply + assert!(inst.replica_bk.is_some()); + let source = inst.replica_bk.as_ref().unwrap().source; + self.transport_hub.send_msg( + PeerMsg::PrepareReply { + slot, + ballot: inst.bal, + voted, + }, + source, + )?; + pf_trace!(self.id; "sent PrepareReply -> {} for slot {} bal {}", + source, slot, inst.bal); + } + + Ok(()) + } + + /// Handler of AcceptData logging result chan recv. + fn handle_logged_accept_data( + &mut self, + slot: usize, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "finished AcceptData logging for slot {} bal {}", + slot, self.insts[slot].bal); + let inst = &self.insts[slot]; + + if self.is_leader { + // on leader, finishing the logging of an AcceptData entry + // is equivalent to receiving an Accept reply from myself + // (as an acceptor role) + self.handle_msg_accept_reply(self.id, slot, inst.bal)?; + } else { + // on follower replica, finishing the logging of an + // AcceptData entry leads to sending back an Accept reply + assert!(inst.replica_bk.is_some()); + let source = inst.replica_bk.as_ref().unwrap().source; + self.transport_hub.send_msg( + PeerMsg::AcceptReply { + slot, + ballot: inst.bal, + }, + source, + )?; + pf_trace!(self.id; "sent AcceptReply -> {} for slot {} bal {}", + source, slot, inst.bal); + } + + Ok(()) + } + + /// Handler of CommitSlot logging result chan recv. + fn handle_logged_commit_slot( + &mut self, + slot: usize, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "finished CommitSlot logging for slot {} bal {}", + slot, self.insts[slot].bal); + assert!(self.insts[slot].status >= Status::Committed); + + // update index of the first non-committed instance + if slot == self.commit_bar { + while self.commit_bar < self.insts.len() { + let inst = &mut self.insts[self.commit_bar]; + if inst.status < Status::Committed { + break; + } + + if inst.reqs_cw.avail_shards() < self.quorum_cnt as usize { + // can't execute if I don't have the complete request batch + pf_debug!(self.id; "postponing execution for slot {} (shards {}/{})", + slot, inst.reqs_cw.avail_shards(), self.quorum_cnt); + break; + } else if inst.reqs_cw.avail_data_shards() + < self.quorum_cnt as usize + { + // have enough shards but need reconstruction + inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; + } + let reqs = inst.reqs_cw.get_data()?; + + // submit commands in committed instance to the state machine + // for execution + if reqs.is_empty() { + inst.status = Status::Executed; + } else if inst.status == Status::Committed { + for (cmd_idx, (_, req)) in reqs.iter().enumerate() { + if let ApiRequest::Req { cmd, .. 
} = req { + self.state_machine.submit_cmd( + Self::make_command_id(self.commit_bar, cmd_idx), + cmd.clone(), + )?; + } else { + continue; // ignore other types of requests + } + } + pf_trace!(self.id; "submitted {} exec commands for slot {}", + reqs.len(), self.commit_bar); + } + + self.commit_bar += 1; + } + } + + Ok(()) + } + + /// Synthesized handler of durable logging result chan recv. + fn handle_log_result( + &mut self, + action_id: LogActionId, + log_result: LogResult, + ) -> Result<(), SummersetError> { + let (slot, entry_type) = Self::split_log_action_id(action_id); + assert!(slot < self.insts.len()); + + if let LogResult::Append { now_size } = log_result { + assert!(now_size >= self.log_offset); + self.log_offset = now_size; + } else { + return logged_err!(self.id; "unexpected log result type: {:?}", log_result); + } + + match entry_type { + Status::Preparing => self.handle_logged_prepare_bal(slot), + Status::Accepting => self.handle_logged_accept_data(slot), + Status::Committed => self.handle_logged_commit_slot(slot), + _ => { + logged_err!(self.id; "unexpected log entry type: {:?}", entry_type) + } + } + } + + /// Handler of Prepare message from leader. + fn handle_msg_prepare( + &mut self, + peer: ReplicaId, + slot: usize, + ballot: Ballot, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received Prepare <- {} for slot {} bal {}", + peer, slot, ballot); + + // if ballot is not smaller than what I have seen: + if ballot >= self.bal_max_seen { + // locate instance in memory, filling in null instances if needed + while self.insts.len() <= slot { + self.insts.push(Instance { + bal: 0, + status: Status::Null, + reqs_cw: RSCodeword::::from_null( + self.quorum_cnt as usize, + (self.population - self.quorum_cnt) as usize, + )?, + leader_bk: None, + replica_bk: None, + }); + } + let inst = &mut self.insts[slot]; + assert!(inst.bal <= ballot); + + inst.bal = ballot; + inst.status = Status::Preparing; + inst.replica_bk = Some(ReplicaBookkeeping { source: peer }); + + // update largest ballot seen + self.bal_max_seen = ballot; + + // record update to largest prepare ballot + self.storage_hub.submit_action( + Self::make_log_action_id(slot, Status::Preparing), + LogAction::Append { + entry: LogEntry::PrepareBal { slot, ballot }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted PrepareBal log action for slot {} bal {}", + slot, ballot); + } + + Ok(()) + } + + /// Handler of Prepare reply from replica. + fn handle_msg_prepare_reply( + &mut self, + peer: ReplicaId, + slot: usize, + ballot: Ballot, + voted: Option<(Ballot, RSCodeword)>, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received PrepareReply <- {} for slot {} bal {} shards {:?}", + peer, slot, ballot, + voted.as_ref().map(|(_, cw)| cw.avail_shards_set())); + + // if ballot is what I'm currently waiting on for Prepare replies: + if ballot == self.bal_prep_sent { + assert!(slot < self.insts.len()); + let inst = &mut self.insts[slot]; + + // ignore spurious duplications and outdated replies + if (inst.status != Status::Preparing) || (ballot < inst.bal) { + return Ok(()); + } + assert_eq!(inst.bal, ballot); + assert!(self.bal_max_seen >= ballot); + assert!(inst.leader_bk.is_some()); + let leader_bk = inst.leader_bk.as_mut().unwrap(); + if leader_bk.prepare_acks.get(peer)? 
{ + return Ok(()); + } + + // bookkeep this Prepare reply + leader_bk.prepare_acks.set(peer, true)?; + if let Some((bal, val)) = voted { + #[allow(clippy::comparison_chain)] + if bal > leader_bk.prepare_max_bal { + // is of ballot > current maximum, so discard the current + // codeword and take the replied codeword + leader_bk.prepare_max_bal = bal; + inst.reqs_cw = val; + } else if bal == leader_bk.prepare_max_bal { + // is of ballot == the one currently taken, so merge the + // replied codeword into the current one + inst.reqs_cw.absorb_other(val)?; + } + } + + // if quorum size reached AND enough shards are known to + // reconstruct the original data, enter Accept phase for this + // instance using the request batch value constructed using shards + // with the highest ballot number in quorum + if leader_bk.prepare_acks.count() >= self.quorum_cnt + && inst.reqs_cw.avail_data_shards() >= self.quorum_cnt as usize + { + inst.status = Status::Accepting; + pf_debug!(self.id; "enter Accept phase for slot {} bal {}", + slot, inst.bal); + + // update bal_prepared + assert!(self.bal_prepared <= ballot); + self.bal_prepared = ballot; + + // if parity shards not computed yet, compute them now + if inst.reqs_cw.avail_shards() < self.population as usize { + inst.reqs_cw.compute_parity(Some(&self.rs_coder))?; + } + + // record update to largest accepted ballot and corresponding data + self.storage_hub.submit_action( + Self::make_log_action_id(slot, Status::Accepting), + LogAction::Append { + entry: LogEntry::AcceptData { + slot, + ballot, + reqs_cw: inst.reqs_cw.subset_copy( + Self::shards_for_replica( + self.id, + self.population, + self.config.shards_per_replica, + ), + false, + )?, + }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted AcceptData log action for slot {} bal {}", + slot, ballot); + + // send Accept messages to all peers + for peer in 0..self.population { + if peer == self.id { + continue; + } + self.transport_hub.send_msg( + PeerMsg::Accept { + slot, + ballot, + reqs_cw: inst.reqs_cw.subset_copy( + Self::shards_for_replica( + peer, + self.population, + self.config.shards_per_replica, + ), + false, + )?, + }, + peer, + )?; + } + pf_trace!(self.id; "broadcast Accept messages for slot {} bal {}", + slot, ballot); + } + } + + Ok(()) + } + + /// Handler of Accept message from leader. 
+ fn handle_msg_accept( + &mut self, + peer: ReplicaId, + slot: usize, + ballot: Ballot, + reqs_cw: RSCodeword, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received Accept <- {} for slot {} bal {} shards {:?}", + peer, slot, ballot, reqs_cw.avail_shards_set()); + + // if ballot is not smaller than what I have made promises for: + if ballot >= self.bal_max_seen { + // locate instance in memory, filling in null instances if needed + while self.insts.len() <= slot { + self.insts.push(Instance { + bal: 0, + status: Status::Null, + reqs_cw: RSCodeword::::from_null( + self.quorum_cnt as usize, + (self.population - self.quorum_cnt) as usize, + )?, + leader_bk: None, + replica_bk: None, + }); + } + let inst = &mut self.insts[slot]; + assert!(inst.bal <= ballot); + + inst.bal = ballot; + inst.status = Status::Accepting; + inst.reqs_cw = reqs_cw; + inst.replica_bk = Some(ReplicaBookkeeping { source: peer }); + + // update largest ballot seen + self.bal_max_seen = ballot; + + // record update to largest prepare ballot + self.storage_hub.submit_action( + Self::make_log_action_id(slot, Status::Accepting), + LogAction::Append { + entry: LogEntry::AcceptData { + slot, + ballot, + reqs_cw: inst.reqs_cw.clone(), + }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted AcceptData log action for slot {} bal {}", + slot, ballot); + } + + Ok(()) + } + + /// Handler of Accept reply from replica. + fn handle_msg_accept_reply( + &mut self, + peer: ReplicaId, + slot: usize, + ballot: Ballot, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received AcceptReply <- {} for slot {} bal {}", + peer, slot, ballot); + + // if ballot is what I'm currently waiting on for Accept replies: + if ballot == self.bal_prepared { + assert!(slot < self.insts.len()); + let inst = &mut self.insts[slot]; + + // ignore spurious duplications and outdated replies + if (inst.status != Status::Accepting) || (ballot < inst.bal) { + return Ok(()); + } + assert_eq!(inst.bal, ballot); + assert!(self.bal_max_seen >= ballot); + assert!(inst.leader_bk.is_some()); + let leader_bk = inst.leader_bk.as_mut().unwrap(); + if leader_bk.accept_acks.get(peer)? { + return Ok(()); + } + + // bookkeep this Accept reply + leader_bk.accept_acks.set(peer, true)?; + + // if quorum size reached AND enough number of shards are + // remembered, mark this instance as committed; in RS-Paxos, this + // means accept_acks.count() >= self.quorum_cnt + fault_tolerance + if leader_bk.accept_acks.count() + >= self.quorum_cnt + self.config.fault_tolerance + { + inst.status = Status::Committed; + pf_debug!(self.id; "committed instance at slot {} bal {}", + slot, inst.bal); + + // record commit event + self.storage_hub.submit_action( + Self::make_log_action_id(slot, Status::Committed), + LogAction::Append { + entry: LogEntry::CommitSlot { slot }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted CommitSlot log action for slot {} bal {}", + slot, inst.bal); + + // send Commit messages to all peers + self.transport_hub + .bcast_msg(PeerMsg::Commit { slot }, None)?; + pf_trace!(self.id; "broadcast Commit messages for slot {} bal {}", + slot, ballot); + } + } + + Ok(()) + } + + /// Handler of Commit message from leader. 
+ /// TODO: take care of missing/lost Commit messages + fn handle_msg_commit( + &mut self, + peer: ReplicaId, + slot: usize, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received Commit <- {} for slot {}", peer, slot); + + // locate instance in memory, filling in null instances if needed + while self.insts.len() <= slot { + self.insts.push(Instance { + bal: 0, + status: Status::Null, + reqs_cw: RSCodeword::::from_null( + self.quorum_cnt as usize, + (self.population - self.quorum_cnt) as usize, + )?, + leader_bk: None, + replica_bk: None, + }); + } + let inst = &mut self.insts[slot]; + + // ignore spurious duplications + if inst.status != Status::Accepting { + return Ok(()); + } + + // mark this instance as committed + inst.status = Status::Committed; + pf_debug!(self.id; "committed instance at slot {} bal {}", + slot, inst.bal); + + // record commit event + self.storage_hub.submit_action( + Self::make_log_action_id(slot, Status::Committed), + LogAction::Append { + entry: LogEntry::CommitSlot { slot }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted CommitSlot log action for slot {} bal {}", + slot, inst.bal); + + Ok(()) + } + + /// Synthesized handler of receiving message from peer. + fn handle_msg_recv( + &mut self, + peer: ReplicaId, + msg: PeerMsg, + ) -> Result<(), SummersetError> { + match msg { + PeerMsg::Prepare { slot, ballot } => { + self.handle_msg_prepare(peer, slot, ballot) + } + PeerMsg::PrepareReply { + slot, + ballot, + voted, + } => self.handle_msg_prepare_reply(peer, slot, ballot, voted), + PeerMsg::Accept { + slot, + ballot, + reqs_cw, + } => self.handle_msg_accept(peer, slot, ballot, reqs_cw), + PeerMsg::AcceptReply { slot, ballot } => { + self.handle_msg_accept_reply(peer, slot, ballot) + } + PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), + } + } + + /// Handler of state machine exec result chan recv. + fn handle_cmd_result( + &mut self, + cmd_id: CommandId, + cmd_result: CommandResult, + ) -> Result<(), SummersetError> { + let (slot, cmd_idx) = Self::split_command_id(cmd_id); + assert!(slot < self.insts.len()); + pf_trace!(self.id; "executed cmd in instance at slot {} idx {}", + slot, cmd_idx); + + let inst = &mut self.insts[slot]; + let reqs = inst.reqs_cw.get_data()?; + assert!(cmd_idx < reqs.len()); + let (client, ref req) = reqs[cmd_idx]; + + // reply command result back to client + if let ApiRequest::Req { id: req_id, .. } = req { + if self.external_api.has_client(client) { + self.external_api.send_reply( + ApiReply::Reply { + id: *req_id, + result: Some(cmd_result), + redirect: None, + }, + client, + )?; + pf_trace!(self.id; "replied -> client {} for slot {} idx {}", + client, slot, cmd_idx); + } + } else { + return logged_err!(self.id; "unexpected API request type"); + } + + // if all commands in this instance have been executed, set status to + // Executed and update `exec_bar` + if cmd_idx == reqs.len() - 1 { + inst.status = Status::Executed; + pf_debug!(self.id; "executed all cmds in instance at slot {}", + slot); + + // update index of the first non-executed instance + if slot == self.exec_bar { + while self.exec_bar < self.insts.len() { + let inst = &mut self.insts[self.exec_bar]; + if inst.status < Status::Executed { + break; + } + self.exec_bar += 1; + } + } + } + + Ok(()) + } + + /// Synthesized handler of manager control messages. 
+ fn handle_ctrl_msg(&mut self, _msg: CtrlMsg) -> Result<(), SummersetError> { + // TODO: fill this when more control message types added + Ok(()) + } +} + +#[async_trait] +impl GenericReplica for CrosswordReplica { + async fn new_and_setup( + api_addr: SocketAddr, + p2p_addr: SocketAddr, + manager: SocketAddr, + config_str: Option<&str>, + ) -> Result { + let config = parsed_config!(config_str => ReplicaConfigCrossword; + batch_interval_us, max_batch_size, + backer_path, logger_sync, fault_tolerance, + shards_per_replica)?; + // connect to the cluster manager and get assigned a server ID + let mut control_hub = ControlHub::new_and_setup(manager).await?; + let id = control_hub.me; + + if config.batch_interval_us == 0 { + return logged_err!( + id; + "invalid config.batch_interval_us '{}'", + config.batch_interval_us + ); + } + + // ask for population number and the list of peers to proactively + // connect to + control_hub.send_ctrl(CtrlMsg::NewServerJoin { + id, + protocol: SmrProtocol::Crossword, + api_addr, + p2p_addr, + })?; + let (population, to_peers) = if let CtrlMsg::ConnectToPeers { + population, + to_peers, + } = control_hub.recv_ctrl().await? + { + (population, to_peers) + } else { + return logged_err!(id; "unexpected ctrl msg type received"); + }; + + // create a Reed-Solomon coder with num_data_shards == quorum size and + // num_parity shards == population - quorum + let quorum_cnt = (population / 2) + 1; + if config.fault_tolerance > (population - quorum_cnt) { + return logged_err!(id; "invalid config.fault_tolerance '{}'", + config.fault_tolerance); + } + if config.shards_per_replica == 0 + || config.shards_per_replica > quorum_cnt + { + return logged_err!(id; "invalid config.shards_per_replica '{}'", + config.shards_per_replica); + } + let rs_coder = ReedSolomon::new( + quorum_cnt as usize, + (population - quorum_cnt) as usize, + )?; + + let state_machine = StateMachine::new_and_setup(id).await?; + + let storage_hub = + StorageHub::new_and_setup(id, Path::new(&config.backer_path)) + .await?; + + let mut transport_hub = + TransportHub::new_and_setup(id, population, p2p_addr).await?; + + // proactively connect to some peers, then wait for all population + // have been connected with me + for (peer, addr) in to_peers { + transport_hub.connect_to_peer(peer, addr).await?; + } + transport_hub.wait_for_group(population).await?; + + let external_api = ExternalApi::new_and_setup( + id, + api_addr, + Duration::from_micros(config.batch_interval_us), + config.max_batch_size, + ) + .await?; + + Ok(CrosswordReplica { + id, + population, + quorum_cnt, + config, + _api_addr: api_addr, + _p2p_addr: p2p_addr, + control_hub, + external_api, + state_machine, + storage_hub, + transport_hub, + is_leader: false, + insts: vec![], + bal_prep_sent: 0, + bal_prepared: 0, + bal_max_seen: 0, + commit_bar: 0, + exec_bar: 0, + log_offset: 0, + rs_coder, + }) + } + + async fn run(&mut self) { + // TODO: proper leader election + if self.id == 0 { + self.is_leader = true; + } + + loop { + tokio::select! 
{ + // client request batch + req_batch = self.external_api.get_req_batch() => { + if let Err(e) = req_batch { + pf_error!(self.id; "error getting req batch: {}", e); + continue; + } + let req_batch = req_batch.unwrap(); + if let Err(e) = self.handle_req_batch(req_batch) { + pf_error!(self.id; "error handling req batch: {}", e); + } + }, + + // durable logging result + log_result = self.storage_hub.get_result() => { + if let Err(e) = log_result { + pf_error!(self.id; "error getting log result: {}", e); + continue; + } + let (action_id, log_result) = log_result.unwrap(); + if let Err(e) = self.handle_log_result(action_id, log_result) { + pf_error!(self.id; "error handling log result {}: {}", + action_id, e); + } + }, + + // message from peer + msg = self.transport_hub.recv_msg() => { + if let Err(e) = msg { + pf_error!(self.id; "error receiving peer msg: {}", e); + continue; + } + let (peer, msg) = msg.unwrap(); + if let Err(e) = self.handle_msg_recv(peer, msg) { + pf_error!(self.id; "error handling msg recv <- {}: {}", peer, e); + } + } + + // state machine execution result + cmd_result = self.state_machine.get_result() => { + if let Err(e) = cmd_result { + pf_error!(self.id; "error getting cmd result: {}", e); + continue; + } + let (cmd_id, cmd_result) = cmd_result.unwrap(); + if let Err(e) = self.handle_cmd_result(cmd_id, cmd_result) { + pf_error!(self.id; "error handling cmd result {}: {}", cmd_id, e); + } + }, + + // manager control message + ctrl_msg = self.control_hub.recv_ctrl() => { + if let Err(e) = ctrl_msg { + pf_error!(self.id; "error getting ctrl msg: {}", e); + continue; + } + let ctrl_msg = ctrl_msg.unwrap(); + if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { + pf_error!(self.id; "error handling ctrl msg: {}", e); + } + } + } + } + } +} + +/// Configuration parameters struct. +#[derive(Debug, Deserialize)] +pub struct ClientConfigCrossword { + /// Which server to pick initially. + pub init_server_id: ReplicaId, +} + +#[allow(clippy::derivable_impls)] +impl Default for ClientConfigCrossword { + fn default() -> Self { + ClientConfigCrossword { init_server_id: 0 } + } +} + +/// Crossword client-side module. +pub struct CrosswordClient { + /// Client ID. + id: ClientId, + + /// Address of the cluster manager oracle. + manager: SocketAddr, + + /// Configuration parameters struct. + _config: ClientConfigCrossword, + + /// Cached list of active servers information. + servers: HashMap, + + /// Current server ID to connect to. + server_id: ReplicaId, + + /// Control API stub to the cluster manager. + ctrl_stub: Option, + + /// API stubs for communicating with servers. 
+ api_stub: Option, +} + +#[async_trait] +impl GenericEndpoint for CrosswordClient { + fn new( + manager: SocketAddr, + config_str: Option<&str>, + ) -> Result { + let config = parsed_config!(config_str => ClientConfigCrossword; + init_server_id)?; + let init_server_id = config.init_server_id; + + Ok(CrosswordClient { + id: 255, // nil at this time + manager, + _config: config, + servers: HashMap::new(), + server_id: init_server_id, + ctrl_stub: None, + api_stub: None, + }) + } + + async fn connect(&mut self) -> Result { + // disallow reconnection without leaving + if self.api_stub.is_some() { + return logged_err!(self.id; "reconnecting without leaving"); + } + + // if ctrl_stubs not established yet, connect to the manager + if self.ctrl_stub.is_none() { + let ctrl_stub = + ClientCtrlStub::new_by_connect(self.manager).await?; + self.id = ctrl_stub.id; + self.ctrl_stub = Some(ctrl_stub); + } + let ctrl_stub = self.ctrl_stub.as_mut().unwrap(); + + // ask the manager about the list of active servers + let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + while !sent { + sent = ctrl_stub.send_req(None)?; + } + + let reply = ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::QueryInfo { servers } => { + // connect to the one with server ID in config + let api_stub = ClientApiStub::new_by_connect( + self.id, + servers[&self.server_id], + ) + .await?; + self.api_stub = Some(api_stub); + self.servers = servers; + Ok(self.id) + } + _ => logged_err!(self.id; "unexpected reply type received"), + } + } + + async fn leave(&mut self, permanent: bool) -> Result<(), SummersetError> { + // send leave notification to current connected server + if let Some(mut api_stub) = self.api_stub.take() { + let mut sent = api_stub.send_req(Some(&ApiRequest::Leave))?; + while !sent { + sent = api_stub.send_req(None)?; + } + + let reply = api_stub.recv_reply().await?; + match reply { + ApiReply::Leave => { + pf_info!(self.id; "left current server connection"); + api_stub.forget(); + } + _ => { + return logged_err!(self.id; "unexpected reply type received"); + } + } + } + + // if permanently leaving, send leave notification to the manager + if permanent { + // disallow multiple permanent leaving + if self.ctrl_stub.is_none() { + return logged_err!(self.id; "repeated permanent leaving"); + } + + if let Some(mut ctrl_stub) = self.ctrl_stub.take() { + let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; + while !sent { + sent = ctrl_stub.send_req(None)?; + } + + let reply = ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::Leave => { + pf_info!(self.id; "left current manager connection"); + ctrl_stub.forget(); + } + _ => { + return logged_err!(self.id; "unexpected reply type received"); + } + } + } + } + + Ok(()) + } + + fn send_req( + &mut self, + req: Option<&ApiRequest>, + ) -> Result { + match self.api_stub { + Some(ref mut api_stub) => api_stub.send_req(req), + None => logged_err!(self.id; "client is not set up"), + } + } + + async fn recv_reply(&mut self) -> Result { + match self.api_stub { + Some(ref mut api_stub) => { + let reply = api_stub.recv_reply().await?; + + if let ApiReply::Reply { + ref result, + ref redirect, + .. 
+ } = reply + { + // if the current server redirects me to a different server + if result.is_none() && redirect.is_some() { + let redirect_id = redirect.unwrap(); + assert!(self.servers.contains_key(&redirect_id)); + self.leave(false).await?; + self.server_id = redirect_id; + self.connect().await?; + pf_debug!(self.id; "redirected to replica {} '{}'", + redirect_id, self.servers[&redirect_id]); + } + } + + Ok(reply) + } + None => logged_err!(self.id; "client is not set up"), + } + } +} diff --git a/src/protocols/mod.rs b/src/protocols/mod.rs index 36d0ea1a..98ecf371 100644 --- a/src/protocols/mod.rs +++ b/src/protocols/mod.rs @@ -26,6 +26,10 @@ mod rs_paxos; use rs_paxos::{RSPaxosReplica, RSPaxosClient}; pub use rs_paxos::{ReplicaConfigRSPaxos, ClientConfigRSPaxos}; +mod crossword; +use crossword::{CrosswordReplica, CrosswordClient}; +pub use crossword::{ReplicaConfigCrossword, ClientConfigCrossword}; + /// Enum of supported replication protocol types. #[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] pub enum SmrProtocol { @@ -33,6 +37,7 @@ pub enum SmrProtocol { SimplePush, MultiPaxos, RSPaxos, + Crossword, } /// Helper macro for saving boilder-plate `Box` mapping in @@ -52,6 +57,7 @@ impl SmrProtocol { "SimplePush" => Some(Self::SimplePush), "MultiPaxos" => Some(Self::MultiPaxos), "RSPaxos" => Some(Self::RSPaxos), + "Crossword" => Some(Self::Crossword), _ => None, } } @@ -108,6 +114,14 @@ impl SmrProtocol { .await ) } + Self::Crossword => { + box_if_ok!( + CrosswordReplica::new_and_setup( + api_addr, p2p_addr, manager, config_str + ) + .await + ) + } } } @@ -130,6 +144,9 @@ impl SmrProtocol { Self::RSPaxos => { box_if_ok!(RSPaxosClient::new(manager, config_str)) } + Self::Crossword => { + box_if_ok!(CrosswordClient::new(manager, config_str)) + } } } } @@ -159,6 +176,7 @@ mod protocols_name_tests { valid_name_test!(SimplePush); valid_name_test!(MultiPaxos); valid_name_test!(RSPaxos); + valid_name_test!(Crossword); } #[test] diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 1dade1c0..e47993de 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -417,7 +417,7 @@ impl RSPaxosReplica { pf_trace!(self.id; "submitted AcceptData log action for slot {} bal {}", slot, inst.bal); - // send Accept messages to all peers, each getting on shard of data + // send Accept messages to all peers, each getting one shard of data for peer in 0..self.population { if peer == self.id { continue; From 2eb416c7b0452cb091f6ff43b1a2cfe6da6fa06c Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Tue, 29 Aug 2023 14:24:47 +0800 Subject: [PATCH 04/89] updates to benchmark scripts --- scripts/local_bench.tmp.py | 76 +++++++++++++++++++++++++++++------ scripts/local_client.py | 22 ++++------- scripts/local_cluster.py | 77 ++++++++++++++++++++++++++---------- scripts/set_tcp_buf_sizes.sh | 10 +++++ 4 files changed, 138 insertions(+), 47 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 6f1ceb78..c20ad6d5 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -1,5 +1,5 @@ import subprocess -import time +import itertools import statistics @@ -24,7 +24,7 @@ def kill_all_matching(name): proc.wait() -def launch_cluster(protocol, num_replicas): +def launch_cluster(protocol, num_replicas, config): cmd = [ "python3", "./scripts/local_cluster.py", @@ -34,9 +34,25 @@ def launch_cluster(protocol, num_replicas): str(num_replicas), "-r", ] + if config is not None and len(config) > 0: + cmd += ["--config", config] 
return run_process(cmd) +def wait_cluster_setup(proc, num_replicas): + accepting_clients = [False for _ in range(num_replicas)] + + for line in iter(proc.stderr.readline, b""): + l = line.decode() + if "manager" not in l and "accepting clients" in l: + replica = int(l[l.find("(") + 1 : l.find(")")]) + assert not accepting_clients[replica] + accepting_clients[replica] = True + + if accepting_clients.count(True) == num_replicas: + break + + def run_bench_client(protocol, value_size, put_ratio, length_s): cmd = [ "python3", @@ -84,7 +100,15 @@ def parse_output(output): print(f" std tpt {std_tpt:9.2f} lat {std_lat:9.2f}") -def bench_round(protocol, num_replicas, value_size, put_ratio, length_s): +def bench_round( + protocol, + num_replicas, + value_size, + put_ratio, + length_s, + fault_tolerance=None, + shards_per_replica=None, +): print( f"{protocol:<10s} n={num_replicas:1d} v={value_size:<9d} w%={put_ratio:<3d} {length_s:3d}s" ) @@ -92,8 +116,13 @@ def bench_round(protocol, num_replicas, value_size, put_ratio, length_s): kill_all_matching("summerset_server") kill_all_matching("summerset_manager") - proc_cluster = launch_cluster(protocol, num_replicas) - time.sleep(15) + configs = [] + if fault_tolerance is not None: + configs.append(f"fault_tolerance={fault_tolerance}") + if shards_per_replica is not None: + configs.append(f"shards_per_replica={shards_per_replica}") + proc_cluster = launch_cluster(protocol, num_replicas, "+".join(configs)) + wait_cluster_setup(proc_cluster, num_replicas) proc_client = run_bench_client(protocol, value_size, put_ratio, length_s) out, err = proc_client.communicate() @@ -110,10 +139,33 @@ def bench_round(protocol, num_replicas, value_size, put_ratio, length_s): if __name__ == "__main__": do_cargo_build() - for num_replicas in (3, 5, 7): - for value_size in (1024, 65536, 4194304): - for protocol in ("MultiPaxos", "RSPaxos"): - bench_round(protocol, num_replicas, value_size, 100, 60) - - bench_round("MultiPaxos", 7, 4194304, 10, 60) - bench_round("RSPaxos", 7, 4194304, 10, 60) + def all_protocol_configs(num_replicas): + quorum_cnt = num_replicas // 2 + 1 + max_fault_tolerance = num_replicas - quorum_cnt + + config_choices = [("MultiPaxos", None, None)] + for shards_per_replica in range(quorum_cnt, 0): + config_choices.append( + ("Crossword", max_fault_tolerance, shards_per_replica) + ) + config_choices.append(("Crossword", 0, 1)) + + return config_choices + + # for num_replicas in (3, 5, 7): + # for value_size in (1024, 65536, 4194304): + # for protocol, fault_tolerance, shards_per_replica in all_protocol_configs( + # num_replicas + # ): + # bench_round( + # protocol, + # num_replicas, + # value_size, + # 100, + # 60, + # fault_tolerance=fault_tolerance, + # shards_per_replica=shards_per_replica, + # ) + + bench_round("MultiPaxos", 5, 65536, 0, 60) + # bench_round("Crossword", 5, 65536, 0, 60, fault_tolerance=0, shards_per_replica=1) diff --git a/scripts/local_client.py b/scripts/local_client.py index 2f9c2c6d..f8ac5981 100644 --- a/scripts/local_client.py +++ b/scripts/local_client.py @@ -21,15 +21,6 @@ def run_process(cmd): MANAGER_CLI_PORT = 52601 -PROTOCOL_CONFIGS = { - "RepNothing": "", - "SimplePush": "", - "MultiPaxos": "", - "RSPaxos": "", - "Crossword": "", -} - - UTILITY_PARAM_NAMES = { "repl": [], "bench": ["freq_target", "value_size", "put_ratio", "length_s"], @@ -63,7 +54,7 @@ def compose_client_cmd(protocol, manager, config, utility, params, release): "-m", manager, ] - if len(config) > 0: + if config is not None and len(config) > 0: cmd += 
["--config", config] cmd += ["-u", utility] @@ -77,11 +68,11 @@ def compose_client_cmd(protocol, manager, config, utility, params, release): return cmd -def run_client(protocol, utility, params, release): +def run_client(protocol, utility, params, release, config): cmd = compose_client_cmd( protocol, f"127.0.0.1:{MANAGER_CLI_PORT}", - PROTOCOL_CONFIGS[protocol], + config, utility, params, release, @@ -97,6 +88,9 @@ def run_client(protocol, utility, params, release): "-p", "--protocol", type=str, required=True, help="protocol name" ) parser.add_argument("-r", "--release", action="store_true", help="run release mode") + parser.add_argument( + "-c", "--config", type=str, help="protocol-specific TOML config string" + ) subparsers = parser.add_subparsers( required=True, @@ -129,9 +123,6 @@ def run_client(protocol, utility, params, release): args = parser.parse_args() - if args.protocol not in PROTOCOL_CONFIGS: - raise ValueError(f"unknown protocol name '{args.protocol}'") - # build everything do_cargo_build(args.release) @@ -141,6 +132,7 @@ def run_client(protocol, utility, params, release): args.utility, glue_params_str(args, UTILITY_PARAM_NAMES[args.utility]), args.release, + args.config, ) rc = client_proc.wait() diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index b7dcdb25..ffbdff15 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -1,7 +1,6 @@ import sys import argparse import subprocess -import time from pathlib import Path @@ -14,9 +13,13 @@ def do_cargo_build(release): proc.wait() -def run_process(cmd): +def run_process(cmd, capture_stderr=False): print("Run:", " ".join(cmd)) - proc = subprocess.Popen(cmd) + proc = None + if capture_stderr: + proc = subprocess.Popen(cmd, stderr=subprocess.PIPE) + else: + proc = subprocess.Popen(cmd) return proc @@ -35,15 +38,28 @@ def kill_all_matching(name): SERVER_P2P_PORT = lambda r: 52800 + r -PROTOCOL_CONFIGS = { - "RepNothing": lambda r, n: f"backer_path='/tmp/summerset.rep_nothing.{r}.wal'", - "SimplePush": lambda r, n: f"backer_path='/tmp/summerset.simple_push.{r}.wal'+rep_degree={n-1}", - "MultiPaxos": lambda r, n: f"backer_path='/tmp/summerset.multipaxos.{r}.wal'", - "RSPaxos": lambda r, n: f"backer_path='/tmp/summerset.rs_paxos.{r}.wal'+fault_tolerance={n-(n//2+1)}", - "Crossword": lambda r, n: f"backer_path='/tmp/summerset.rs_paxos.{r}.wal'+fault_tolerance=0+shards_per_replica=3", +PROTOCOL_BACKER_PATH = { + "RepNothing": lambda r: f"backer_path='/tmp/summerset.rep_nothing.{r}.wal'", + "SimplePush": lambda r: f"backer_path='/tmp/summerset.simple_push.{r}.wal'", + "MultiPaxos": lambda r: f"backer_path='/tmp/summerset.multipaxos.{r}.wal'", + "RSPaxos": lambda r: f"backer_path='/tmp/summerset.rs_paxos.{r}.wal'", + "Crossword": lambda r: f"backer_path='/tmp/summerset.crossword.{r}.wal'", } +def config_with_backer_path(protocol, config, replica): + result_config = PROTOCOL_BACKER_PATH[protocol](replica) + + if config is not None and len(config) > 0: + if "backer_path" in config: + result_config = config # use user-supplied path + else: + result_config += "+" + result_config += config + + return result_config + + def compose_manager_cmd(protocol, srv_port, cli_port, num_replicas, release): cmd = [f"./target/{'release' if release else 'debug'}/summerset_manager"] cmd += [ @@ -67,7 +83,26 @@ def launch_manager(protocol, num_replicas, release): num_replicas, release, ) - return run_process(cmd) + return run_process(cmd, capture_stderr=True) + + +def wait_manager_setup(proc): + accepting_servers, 
accepting_clients = False, False + + for line in iter(proc.stderr.readline, b""): + sys.stderr.buffer.write(line) + sys.stderr.flush() + + l = line.decode() + if "(m) accepting servers" in l: + assert not accepting_servers + accepting_servers = True + if "(m) accepting clients" in l: + assert not accepting_clients + accepting_clients = True + + if accepting_servers and accepting_clients: + break def compose_server_cmd(protocol, api_port, p2p_port, manager, config, release): @@ -82,12 +117,12 @@ def compose_server_cmd(protocol, api_port, p2p_port, manager, config, release): "-m", manager, ] - if len(config) > 0: + if config is not None and len(config) > 0: cmd += ["--config", config] return cmd -def launch_servers(protocol, num_replicas, release): +def launch_servers(protocol, num_replicas, release, config): server_procs = [] for replica in range(num_replicas): cmd = compose_server_cmd( @@ -95,7 +130,7 @@ def launch_servers(protocol, num_replicas, release): SERVER_API_PORT(replica), SERVER_P2P_PORT(replica), f"127.0.0.1:{MANAGER_SRV_PORT}", - PROTOCOL_CONFIGS[protocol](replica, num_replicas), + config_with_backer_path(protocol, config, replica), release, ) proc = run_process(cmd) @@ -115,13 +150,11 @@ def launch_servers(protocol, num_replicas, release): parser.add_argument( "-r", "--release", action="store_true", help="if set, run release mode" ) + parser.add_argument( + "-c", "--config", type=str, help="protocol-specific TOML config string" + ) args = parser.parse_args() - if args.protocol not in PROTOCOL_CONFIGS: - raise ValueError(f"unknown protocol name '{args.protocol}'") - if args.num_replicas <= 0 or args.num_replicas > 9: - raise ValueError(f"invalid number of replicas {args.num_replicas}") - # kill all existing server and manager processes kill_all_matching("summerset_server") kill_all_matching("summerset_manager") @@ -135,10 +168,14 @@ def launch_servers(protocol, num_replicas, release): # launch cluster manager oracle first manager_proc = launch_manager(args.protocol, args.num_replicas, args.release) - time.sleep(5) + wait_manager_setup(manager_proc) # then launch server replicas - launch_servers(args.protocol, args.num_replicas, args.release) + launch_servers(args.protocol, args.num_replicas, args.release, args.config) + + for line in iter(manager_proc.stderr.readline, b""): + sys.stderr.buffer.write(line) + sys.stderr.flush() rc = manager_proc.wait() sys.exit(rc) diff --git a/scripts/set_tcp_buf_sizes.sh b/scripts/set_tcp_buf_sizes.sh index 2d3e3f21..55d8d0a4 100755 --- a/scripts/set_tcp_buf_sizes.sh +++ b/scripts/set_tcp_buf_sizes.sh @@ -1,12 +1,22 @@ #! 
/usr/bin/bash +echo "Per-socket TCP send/receive buffer:" +echo "min default max" echo "4096 131072 33554432" | sudo tee /proc/sys/net/ipv4/tcp_rmem echo "4096 131072 33554432" | sudo tee /proc/sys/net/ipv4/tcp_wmem +echo +echo "System-wide total buffer size:" +echo "min default max" echo "1538757 16413408 24620112" | sudo tee /proc/sys/net/ipv4/tcp_mem +echo +echo "Max value of setsockopt:" echo "33554432" | sudo tee /proc/sys/net/core/rmem_max echo "33554432" | sudo tee /proc/sys/net/core/wmem_max +echo +echo "Default value of network socket:" echo "131072" | sudo tee /proc/sys/net/core/rmem_default echo "131072" | sudo tee /proc/sys/net/core/wmem_default +echo From f9960edec58fe55dddbd0e2e2346636c84411c90 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Tue, 29 Aug 2023 16:44:12 +0800 Subject: [PATCH 05/89] make Bitmap a general u8-indexed map --- src/lib.rs | 2 +- src/protocols/crossword.rs | 53 +++++++------- src/protocols/multipaxos.rs | 14 ++-- src/protocols/rs_paxos.rs | 54 +++++++------- src/protocols/simple_push.rs | 8 +-- src/server/transport.rs | 12 ++-- src/utils/bitmap.rs | 86 +++++++++++++++------- src/utils/mod.rs | 2 +- src/utils/rscoding.rs | 134 ++++++++++++++++++++--------------- 9 files changed, 210 insertions(+), 155 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 40bcbf31..24a24bb6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,7 +13,7 @@ mod protocols; // Things (other than exported macros) exposed to users of this crate: #[doc(inline)] -pub use crate::utils::{SummersetError, ReplicaMap, Timer}; +pub use crate::utils::{SummersetError, Bitmap, Timer}; #[doc(inline)] pub use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply, ClusterManager}; diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index dfe647ea..7f6bc743 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -3,11 +3,11 @@ //! MultiPaxos with flexible Reed-Solomon erasure coding that supports tunable //! shard groups and asymmetric shard assignment. -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::path::Path; use std::net::SocketAddr; -use crate::utils::{SummersetError, ReplicaMap, RSCodeword}; +use crate::utils::{SummersetError, Bitmap, RSCodeword}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, @@ -84,13 +84,13 @@ type ReqBatch = Vec<(ClientId, ApiRequest)>; #[derive(Debug, Clone)] struct LeaderBookkeeping { /// Replicas from which I have received Prepare confirmations. - prepare_acks: ReplicaMap, + prepare_acks: Bitmap, /// Max ballot among received Prepare replies. prepare_max_bal: Ballot, /// Replicas from which I have received Accept confirmations. - accept_acks: ReplicaMap, + accept_acks: Bitmap, } /// Follower-side bookkeeping info for each instance received. @@ -284,10 +284,9 @@ impl CrosswordReplica { id: ReplicaId, population: u8, num_shards: u8, - ) -> HashSet { - (id..(id + num_shards)) - .map(|i| (i % population) as usize) - .collect() + ) -> Bitmap { + let ones = (id..(id + num_shards)).map(|i| (i % population)).collect(); + Bitmap::from(population, ones) } /// Handler of client request batch chan recv. 
@@ -323,8 +322,8 @@ impl CrosswordReplica { // compute the complete Reed-Solomon codeword for the batch data let mut reqs_cw = RSCodeword::from_data( req_batch, - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?; reqs_cw.compute_parity(Some(&self.rs_coder))?; @@ -343,9 +342,9 @@ impl CrosswordReplica { assert_eq!(old_inst.status, Status::Null); old_inst.reqs_cw = reqs_cw; old_inst.leader_bk = Some(LeaderBookkeeping { - prepare_acks: ReplicaMap::new(self.population, false), + prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: ReplicaMap::new(self.population, false), + accept_acks: Bitmap::new(self.population, false), }); } else { let new_inst = Instance { @@ -353,9 +352,9 @@ impl CrosswordReplica { status: Status::Null, reqs_cw, leader_bk: Some(LeaderBookkeeping { - prepare_acks: ReplicaMap::new(self.population, false), + prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: ReplicaMap::new(self.population, false), + accept_acks: Bitmap::new(self.population, false), }), replica_bk: None, }; @@ -555,14 +554,12 @@ impl CrosswordReplica { break; } - if inst.reqs_cw.avail_shards() < self.quorum_cnt as usize { + if inst.reqs_cw.avail_shards() < self.quorum_cnt { // can't execute if I don't have the complete request batch pf_debug!(self.id; "postponing execution for slot {} (shards {}/{})", slot, inst.reqs_cw.avail_shards(), self.quorum_cnt); break; - } else if inst.reqs_cw.avail_data_shards() - < self.quorum_cnt as usize - { + } else if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { // have enough shards but need reconstruction inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; } @@ -638,8 +635,8 @@ impl CrosswordReplica { bal: 0, status: Status::Null, reqs_cw: RSCodeword::::from_null( - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?, leader_bk: None, replica_bk: None, @@ -680,7 +677,7 @@ impl CrosswordReplica { ) -> Result<(), SummersetError> { pf_trace!(self.id; "received PrepareReply <- {} for slot {} bal {} shards {:?}", peer, slot, ballot, - voted.as_ref().map(|(_, cw)| cw.avail_shards_set())); + voted.as_ref().map(|(_, cw)| cw.avail_shards_map())); // if ballot is what I'm currently waiting on for Prepare replies: if ballot == self.bal_prep_sent { @@ -720,7 +717,7 @@ impl CrosswordReplica { // instance using the request batch value constructed using shards // with the highest ballot number in quorum if leader_bk.prepare_acks.count() >= self.quorum_cnt - && inst.reqs_cw.avail_data_shards() >= self.quorum_cnt as usize + && inst.reqs_cw.avail_data_shards() >= self.quorum_cnt { inst.status = Status::Accepting; pf_debug!(self.id; "enter Accept phase for slot {} bal {}", @@ -731,7 +728,7 @@ impl CrosswordReplica { self.bal_prepared = ballot; // if parity shards not computed yet, compute them now - if inst.reqs_cw.avail_shards() < self.population as usize { + if inst.reqs_cw.avail_shards() < self.population { inst.reqs_cw.compute_parity(Some(&self.rs_coder))?; } @@ -795,7 +792,7 @@ impl CrosswordReplica { reqs_cw: RSCodeword, ) -> Result<(), SummersetError> { pf_trace!(self.id; "received Accept <- {} for slot {} bal {} shards {:?}", - peer, slot, ballot, reqs_cw.avail_shards_set()); + peer, slot, ballot, reqs_cw.avail_shards_map()); // if ballot is not smaller than what I have made promises for: if ballot >= self.bal_max_seen { @@ -805,8 +802,8 @@ impl 
CrosswordReplica { bal: 0, status: Status::Null, reqs_cw: RSCodeword::::from_null( - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?, leader_bk: None, replica_bk: None, @@ -919,8 +916,8 @@ impl CrosswordReplica { bal: 0, status: Status::Null, reqs_cw: RSCodeword::::from_null( - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?, leader_bk: None, replica_bk: None, diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index c268372c..d44056f1 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -11,7 +11,7 @@ use std::collections::HashMap; use std::path::Path; use std::net::SocketAddr; -use crate::utils::{SummersetError, ReplicaMap}; +use crate::utils::{SummersetError, Bitmap}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, @@ -77,13 +77,13 @@ type ReqBatch = Vec<(ClientId, ApiRequest)>; #[derive(Debug, Clone)] struct LeaderBookkeeping { /// Replicas from which I have received Prepare confirmations. - prepare_acks: ReplicaMap, + prepare_acks: Bitmap, /// Max ballot among received Prepare replies. prepare_max_bal: Ballot, /// Replicas from which I have received Accept confirmations. - accept_acks: ReplicaMap, + accept_acks: Bitmap, } /// Follower-side bookkeeping info for each instance received. @@ -308,9 +308,9 @@ impl MultiPaxosReplica { if old_inst.status == Status::Null { old_inst.reqs = req_batch.clone(); old_inst.leader_bk = Some(LeaderBookkeeping { - prepare_acks: ReplicaMap::new(self.population, false), + prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: ReplicaMap::new(self.population, false), + accept_acks: Bitmap::new(self.population, false), }); slot = s; break; @@ -322,9 +322,9 @@ impl MultiPaxosReplica { status: Status::Null, reqs: req_batch.clone(), leader_bk: Some(LeaderBookkeeping { - prepare_acks: ReplicaMap::new(self.population, false), + prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: ReplicaMap::new(self.population, false), + accept_acks: Bitmap::new(self.population, false), }), replica_bk: None, }; diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index e47993de..b2da668d 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -3,11 +3,11 @@ //! MultiPaxos with Reed-Solomon erasure coding. References: //! - -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::path::Path; use std::net::SocketAddr; -use crate::utils::{SummersetError, ReplicaMap, RSCodeword}; +use crate::utils::{SummersetError, Bitmap, RSCodeword}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, @@ -79,13 +79,13 @@ type ReqBatch = Vec<(ClientId, ApiRequest)>; #[derive(Debug, Clone)] struct LeaderBookkeeping { /// Replicas from which I have received Prepare confirmations. - prepare_acks: ReplicaMap, + prepare_acks: Bitmap, /// Max ballot among received Prepare replies. prepare_max_bal: Ballot, /// Replicas from which I have received Accept confirmations. - accept_acks: ReplicaMap, + accept_acks: Bitmap, } /// Follower-side bookkeeping info for each instance received. 
@@ -307,8 +307,8 @@ impl RSPaxosReplica { // compute the complete Reed-Solomon codeword for the batch data let mut reqs_cw = RSCodeword::from_data( req_batch, - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?; reqs_cw.compute_parity(Some(&self.rs_coder))?; @@ -327,9 +327,9 @@ impl RSPaxosReplica { assert_eq!(old_inst.status, Status::Null); old_inst.reqs_cw = reqs_cw; old_inst.leader_bk = Some(LeaderBookkeeping { - prepare_acks: ReplicaMap::new(self.population, false), + prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: ReplicaMap::new(self.population, false), + accept_acks: Bitmap::new(self.population, false), }); } else { let new_inst = Instance { @@ -337,9 +337,9 @@ impl RSPaxosReplica { status: Status::Null, reqs_cw, leader_bk: Some(LeaderBookkeeping { - prepare_acks: ReplicaMap::new(self.population, false), + prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: ReplicaMap::new(self.population, false), + accept_acks: Bitmap::new(self.population, false), }), replica_bk: None, }; @@ -407,7 +407,7 @@ impl RSPaxosReplica { ballot: inst.bal, // persist only one shard on myself reqs_cw: inst.reqs_cw.subset_copy( - HashSet::from([self.id as usize]), + Bitmap::from(self.population, vec![self.id]), false, )?, }, @@ -427,7 +427,7 @@ impl RSPaxosReplica { slot, ballot: inst.bal, reqs_cw: inst.reqs_cw.subset_copy( - HashSet::from([peer as usize]), + Bitmap::from(self.population, vec![peer]), false, )?, }, @@ -530,14 +530,12 @@ impl RSPaxosReplica { break; } - if inst.reqs_cw.avail_shards() < self.quorum_cnt as usize { + if inst.reqs_cw.avail_shards() < self.quorum_cnt { // can't execute if I don't have the complete request batch pf_debug!(self.id; "postponing execution for slot {} (shards {}/{})", slot, inst.reqs_cw.avail_shards(), self.quorum_cnt); break; - } else if inst.reqs_cw.avail_data_shards() - < self.quorum_cnt as usize - { + } else if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { // have enough shards but need reconstruction inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; } @@ -613,8 +611,8 @@ impl RSPaxosReplica { bal: 0, status: Status::Null, reqs_cw: RSCodeword::::from_null( - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?, leader_bk: None, replica_bk: None, @@ -655,7 +653,7 @@ impl RSPaxosReplica { ) -> Result<(), SummersetError> { pf_trace!(self.id; "received PrepareReply <- {} for slot {} bal {} shards {:?}", peer, slot, ballot, - voted.as_ref().map(|(_, cw)| cw.avail_shards_set())); + voted.as_ref().map(|(_, cw)| cw.avail_shards_map())); // if ballot is what I'm currently waiting on for Prepare replies: if ballot == self.bal_prep_sent { @@ -695,7 +693,7 @@ impl RSPaxosReplica { // instance using the request batch value constructed using shards // with the highest ballot number in quorum if leader_bk.prepare_acks.count() >= self.quorum_cnt - && inst.reqs_cw.avail_data_shards() >= self.quorum_cnt as usize + && inst.reqs_cw.avail_data_shards() >= self.quorum_cnt { inst.status = Status::Accepting; pf_debug!(self.id; "enter Accept phase for slot {} bal {}", @@ -706,7 +704,7 @@ impl RSPaxosReplica { self.bal_prepared = ballot; // if parity shards not computed yet, compute them now - if inst.reqs_cw.avail_shards() < self.population as usize { + if inst.reqs_cw.avail_shards() < self.population { 
inst.reqs_cw.compute_parity(Some(&self.rs_coder))?; } @@ -718,7 +716,7 @@ impl RSPaxosReplica { slot, ballot, reqs_cw: inst.reqs_cw.subset_copy( - HashSet::from([self.id as usize]), + Bitmap::from(self.population, vec![self.id]), false, )?, }, @@ -738,7 +736,7 @@ impl RSPaxosReplica { slot, ballot, reqs_cw: inst.reqs_cw.subset_copy( - HashSet::from([peer as usize]), + Bitmap::from(self.population, vec![peer]), false, )?, }, @@ -762,7 +760,7 @@ impl RSPaxosReplica { reqs_cw: RSCodeword, ) -> Result<(), SummersetError> { pf_trace!(self.id; "received Accept <- {} for slot {} bal {} shards {:?}", - peer, slot, ballot, reqs_cw.avail_shards_set()); + peer, slot, ballot, reqs_cw.avail_shards_map()); // if ballot is not smaller than what I have made promises for: if ballot >= self.bal_max_seen { @@ -772,8 +770,8 @@ impl RSPaxosReplica { bal: 0, status: Status::Null, reqs_cw: RSCodeword::::from_null( - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?, leader_bk: None, replica_bk: None, @@ -886,8 +884,8 @@ impl RSPaxosReplica { bal: 0, status: Status::Null, reqs_cw: RSCodeword::::from_null( - self.quorum_cnt as usize, - (self.population - self.quorum_cnt) as usize, + self.quorum_cnt, + self.population - self.quorum_cnt, )?, leader_bk: None, replica_bk: None, diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index d212f98b..eb082de3 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -7,7 +7,7 @@ use std::path::Path; use std::net::SocketAddr; -use crate::utils::{SummersetError, ReplicaMap}; +use crate::utils::{SummersetError, Bitmap}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, @@ -80,7 +80,7 @@ enum PushMsg { struct Instance { reqs: Vec<(ClientId, ApiRequest)>, durable: bool, - pending_peers: ReplicaMap, + pending_peers: Bitmap, execed: Vec, from_peer: Option<(ReplicaId, usize)>, // peer ID, peer inst_idx } @@ -148,7 +148,7 @@ impl SimplePushReplica { assert!(batch_size > 0); // target peers to push to - let mut target = ReplicaMap::new(self.population, false); + let mut target = Bitmap::new(self.population, false); let mut peer_cnt = 0; for peer in 0..self.population { if peer_cnt == self.config.rep_degree { @@ -262,7 +262,7 @@ impl SimplePushReplica { let inst = Instance { reqs: req_batch.clone(), durable: false, - pending_peers: ReplicaMap::new(self.population, false), + pending_peers: Bitmap::new(self.population, false), execed: vec![false; req_batch.len()], from_peer: Some((peer, src_inst_idx)), }; diff --git a/src/server/transport.rs b/src/server/transport.rs index fb4bae0e..10deff8b 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -3,7 +3,7 @@ use std::fmt; use std::net::SocketAddr; -use crate::utils::{SummersetError, ReplicaMap, safe_tcp_read, safe_tcp_write}; +use crate::utils::{SummersetError, Bitmap, safe_tcp_read, safe_tcp_write}; use crate::server::ReplicaId; use bytes::BytesMut; @@ -144,10 +144,10 @@ where } } - /// Gets a ReplicaMap where currently connected peers are set true. - pub fn current_peers(&self) -> Result { + /// Gets a bitmap where currently connected peers are set true. 
+ pub fn current_peers(&self) -> Result { let tx_sends_guard = self.tx_sends.guard(); - let mut peers = ReplicaMap::new(self.population, false); + let mut peers = Bitmap::new(self.population, false); for &id in tx_sends_guard.keys() { if let Err(e) = peers.set(id, true) { return logged_err!(self.me; "error setting peer {}: {}", @@ -187,7 +187,7 @@ where pub fn bcast_msg( &mut self, msg: Msg, - target: Option, + target: Option, ) -> Result<(), SummersetError> { let tx_sends_guard = self.tx_sends.guard(); for &peer in tx_sends_guard.keys() { @@ -624,7 +624,7 @@ mod transport_tests { assert!(id == 1 || id == 2); assert_eq!(msg, TestMsg("world".into())); // send another message to 1 only - let mut map = ReplicaMap::new(3, false); + let mut map = Bitmap::new(3, false); map.set(1, true)?; hub.bcast_msg(TestMsg("nice".into()), Some(map))?; // recv another message from 1 diff --git a/src/utils/bitmap.rs b/src/utils/bitmap.rs index a7f27d98..dfbb8467 100644 --- a/src/utils/bitmap.rs +++ b/src/utils/bitmap.rs @@ -1,15 +1,16 @@ //! Bitmap data structure helper. +use std::fmt; + use crate::utils::SummersetError; -use crate::server::ReplicaId; use fixedbitset::FixedBitSet; -/// Compact bitmap for replica ID -> bool mapping. -#[derive(Debug, Clone)] -pub struct ReplicaMap(FixedBitSet); +/// Compact bitmap for u8 ID -> bool mapping. +#[derive(Clone, PartialEq, Eq)] +pub struct Bitmap(FixedBitSet); -impl ReplicaMap { +impl Bitmap { /// Creates a new bitmap of given size. If `ones` is true, all slots are /// marked true initially; otherwise, all slots are initially false. pub fn new(size: u8, ones: bool) -> Self { @@ -17,18 +18,31 @@ impl ReplicaMap { panic!("invalid bitmap size {}", size); } let mut bitset = FixedBitSet::with_capacity(size as usize); + if ones { bitset.set_range(.., true); } - ReplicaMap(bitset) + + Bitmap(bitset) + } + + /// Creates a new bitmap of given size from vec literal. Indices in the + /// vec are bits to be set as true. + pub fn from(size: u8, ones: Vec) -> Self { + let mut bitmap = Self::new(size, false); + + for idx in ones { + if let Err(e) = bitmap.set(idx, true) { + panic!("{}", e); + } + } + + bitmap } /// Sets bit at index to given flag. - pub fn set( - &mut self, - idx: ReplicaId, - flag: bool, - ) -> Result<(), SummersetError> { + #[inline] + pub fn set(&mut self, idx: u8, flag: bool) -> Result<(), SummersetError> { if idx as usize >= self.0.len() { return Err(SummersetError(format!("index {} out of bound", idx))); } @@ -37,7 +51,8 @@ impl ReplicaMap { } /// Gets the bit flag at index. - pub fn get(&self, idx: ReplicaId) -> Result { + #[inline] + pub fn get(&self, idx: u8) -> Result { if idx as usize >= self.0.len() { return Err(SummersetError(format!("index {} out of bound", idx))); } @@ -45,33 +60,36 @@ impl ReplicaMap { } /// Returns the size of the bitmap. + #[inline] pub fn size(&self) -> u8 { self.0.len() as u8 } /// Returns the number of trues in the bitmap. + #[inline] pub fn count(&self) -> u8 { self.0.count_ones(..) as u8 } /// Allows `for (id, bit) in map.iter()`. - pub fn iter(&self) -> ReplicaMapIter { - ReplicaMapIter { map: self, idx: 0 } + #[inline] + pub fn iter(&self) -> BitmapIter { + BitmapIter { map: self, idx: 0 } } } -/// Iterator over `ReplicaMap`, yielding `(id, bit)` pairs. +/// Iterator over `Bitmap`, yielding `(id, bit)` pairs. 
#[derive(Debug, Clone)] -pub struct ReplicaMapIter<'m> { - map: &'m ReplicaMap, +pub struct BitmapIter<'m> { + map: &'m Bitmap, idx: usize, } -impl Iterator for ReplicaMapIter<'_> { - type Item = (ReplicaId, bool); +impl Iterator for BitmapIter<'_> { + type Item = (u8, bool); fn next(&mut self) -> Option { - let id: ReplicaId = self.idx as ReplicaId; + let id: u8 = self.idx as u8; if id < self.map.size() { self.idx += 1; Some((id, self.map.get(id).unwrap())) @@ -81,6 +99,26 @@ impl Iterator for ReplicaMapIter<'_> { } } +// Implement `Debug` trait manually for better trace printing. +impl fmt::Debug for Bitmap { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{{{}; [", self.size())?; + let mut first_idx = true; + for i in self + .iter() + .filter_map(|(i, flag)| if flag { Some(i) } else { None }) + { + if !first_idx { + write!(f, ", {}", i)?; + } else { + write!(f, "{}", i)?; + first_idx = false; + } + } + write!(f, "]}}") + } +} + #[cfg(test)] mod bitmap_tests { use super::*; @@ -88,12 +126,12 @@ mod bitmap_tests { #[test] #[should_panic] fn bitmap_new_panic() { - ReplicaMap::new(0, true); + Bitmap::new(0, true); } #[test] fn bitmap_set_get() { - let mut map = ReplicaMap::new(7, false); + let mut map = Bitmap::new(7, false); assert!(map.set(0, true).is_ok()); assert!(map.set(1, false).is_ok()); assert!(map.set(2, true).is_ok()); @@ -107,7 +145,7 @@ mod bitmap_tests { #[test] fn bitmap_count() { - let mut map = ReplicaMap::new(7, false); + let mut map = Bitmap::new(7, false); assert_eq!(map.count(), 0); assert!(map.set(0, true).is_ok()); assert!(map.set(2, true).is_ok()); @@ -118,7 +156,7 @@ mod bitmap_tests { #[test] fn bitmap_iter() { let ref_map = vec![true, true, false, true, true]; - let mut map = ReplicaMap::new(5, true); + let mut map = Bitmap::new(5, true); assert!(map.set(2, false).is_ok()); for (id, flag) in map.iter() { assert_eq!(ref_map[id as usize], flag); diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 6feb3e1e..7510b772 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -13,7 +13,7 @@ mod safetcp; mod rscoding; pub use error::SummersetError; -pub use bitmap::ReplicaMap; +pub use bitmap::Bitmap; pub use timer::Timer; pub use safetcp::{safe_tcp_read, safe_tcp_write}; pub use rscoding::RSCodeword; diff --git a/src/utils/rscoding.rs b/src/utils/rscoding.rs index 659077ac..49c008a3 100644 --- a/src/utils/rscoding.rs +++ b/src/utils/rscoding.rs @@ -2,10 +2,9 @@ use std::fmt; use std::io; -use std::collections::HashSet; use std::marker::PhantomData; -use crate::utils::SummersetError; +use crate::utils::{SummersetError, Bitmap}; use bytes::{BytesMut, BufMut}; @@ -20,10 +19,10 @@ use reed_solomon_erasure::galois_8::ReedSolomon; #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] pub struct RSCodeword { /// Number of data shards. - num_data_shards: usize, + num_data_shards: u8, /// Number of parity shards. - num_parity_shards: usize, + num_parity_shards: u8, /// Exact length of original data in bytes. 
data_len: usize, @@ -53,13 +52,13 @@ where data_copy: Option, data_bytes: Option, data_len: usize, - num_data_shards: usize, - num_parity_shards: usize, + num_data_shards: u8, + num_parity_shards: u8, ) -> Result { if num_data_shards == 0 { return Err(SummersetError("num_data_shards is zero".into())); } - if data_len != 0 && data_len < num_data_shards { + if data_len != 0 && data_len < num_data_shards as usize { return Err(SummersetError(format!( "data length too small: {}", data_len @@ -67,10 +66,10 @@ where } let num_total_shards = num_data_shards + num_parity_shards; - let shard_len = if data_len % num_data_shards == 0 { - data_len / num_data_shards + let shard_len = if data_len % num_data_shards as usize == 0 { + data_len / num_data_shards as usize } else { - (data_len / num_data_shards) + 1 + (data_len / num_data_shards as usize) + 1 }; let shards = if let Some(mut data_bytes) = data_bytes { @@ -78,11 +77,11 @@ where assert_eq!(data_bytes.len(), data_len); // pad length to multiple of num_data_shards and compute shard size - let padded_len = shard_len * num_data_shards; + let padded_len = shard_len * num_data_shards as usize; data_bytes.resize(padded_len, 0); // split the bytes representation into contiguously stored shards - let mut shards = Vec::with_capacity(num_data_shards); + let mut shards = Vec::with_capacity(num_data_shards as usize); for _ in 0..(num_data_shards - 1) { let shard = data_bytes.split_to(shard_len); assert_eq!(shard.len(), shard_len); @@ -90,15 +89,15 @@ where } assert_eq!(data_bytes.len(), shard_len); shards.push(Some(data_bytes)); // the last shard - assert_eq!(shards.len(), num_data_shards); + assert_eq!(shards.len(), num_data_shards as usize); for _ in num_data_shards..num_total_shards { shards.push(None); } - assert_eq!(shards.len(), num_total_shards); + assert_eq!(shards.len(), num_total_shards as usize); shards } else { // if newing from empty - vec![None; num_total_shards] + vec![None; num_total_shards as usize] }; Ok(RSCodeword { @@ -115,8 +114,8 @@ where /// Creates a new RSCodeword from original data. pub fn from_data( data: T, - num_data_shards: usize, - num_parity_shards: usize, + num_data_shards: u8, + num_parity_shards: u8, ) -> Result { // serialize original data into bytes let mut data_writer = BytesMut::new().writer(); @@ -133,8 +132,8 @@ where /// Creates a new RSCodeword from empty bytes. pub fn from_null( - num_data_shards: usize, - num_parity_shards: usize, + num_data_shards: u8, + num_parity_shards: u8, ) -> Result { Self::new(None, None, 0, num_data_shards, num_parity_shards) } @@ -143,15 +142,25 @@ where /// shards, and a complete copy of the original data if required. pub fn subset_copy( &self, - subset: HashSet, + subset: Bitmap, copy_data: bool, ) -> Result { if self.data_len == 0 { return Err(SummersetError("codeword is null".into())); } - let mut shards = vec![None; self.num_shards()]; - for i in subset { + let mut shards = vec![None; self.num_shards() as usize]; + for i in + subset.iter().filter_map( + |(i, flag)| { + if flag { + Some(i as usize) + } else { + None + } + }, + ) + { if i >= shards.len() { return Err(SummersetError(format!( "shard index {} out-of-bound", @@ -231,60 +240,71 @@ where } /// Gets number of data shards. - pub fn num_data_shards(&self) -> usize { + #[inline] + pub fn num_data_shards(&self) -> u8 { self.num_data_shards } /// Gets number of parity shards. 
#[allow(dead_code)] - pub fn num_parity_shards(&self) -> usize { + #[inline] + pub fn num_parity_shards(&self) -> u8 { self.num_parity_shards } /// Gets total number of shards. - pub fn num_shards(&self) -> usize { - self.shards.len() + #[inline] + pub fn num_shards(&self) -> u8 { + self.shards.len() as u8 } /// Gets number of currently available data shards. - pub fn avail_data_shards(&self) -> usize { + #[inline] + pub fn avail_data_shards(&self) -> u8 { self.shards .iter() - .take(self.num_data_shards) + .take(self.num_data_shards as usize) .filter(|s| s.is_some()) - .count() + .count() as u8 } /// Gets number of currently available parity shards. #[allow(dead_code)] - pub fn avail_parity_shards(&self) -> usize { + #[inline] + pub fn avail_parity_shards(&self) -> u8 { self.shards .iter() - .skip(self.num_data_shards) + .skip(self.num_data_shards as usize) .filter(|s| s.is_some()) - .count() + .count() as u8 } /// Gets total number of currently available shards. - pub fn avail_shards(&self) -> usize { - self.shards.iter().filter(|s| s.is_some()).count() + #[inline] + pub fn avail_shards(&self) -> u8 { + self.shards.iter().filter(|s| s.is_some()).count() as u8 } - /// Gets the set of available shard indexes. - pub fn avail_shards_set(&self) -> HashSet { - self.shards + /// Gets a bitmap of available shard indexes set true. + #[inline] + pub fn avail_shards_map(&self) -> Bitmap { + let ones: Vec = self + .shards .iter() .enumerate() - .filter_map(|(i, s)| if s.is_some() { Some(i) } else { None }) - .collect() + .filter_map(|(i, s)| if s.is_some() { Some(i as u8) } else { None }) + .collect(); + Bitmap::from(self.num_shards(), ones) } /// Gets length of original data in bytes. + #[inline] pub fn data_len(&self) -> usize { self.data_len } /// Gets length of a shard in bytes. + #[inline] pub fn shard_len(&self) -> usize { self.shard_len } @@ -295,13 +315,13 @@ where &self, rs: &ReedSolomon, ) -> Result<(), SummersetError> { - if rs.data_shard_count() != self.num_data_shards { + if rs.data_shard_count() != self.num_data_shards as usize { Err(SummersetError(format!( "num_data_shards mismatch: expected {}, rs {}", self.num_data_shards, rs.data_shard_count() ))) - } else if rs.parity_shard_count() != self.num_parity_shards { + } else if rs.parity_shard_count() != self.num_parity_shards as usize { Err(SummersetError(format!( "num_parity_shards mismatch: expected {}, rs {}", self.num_parity_shards, @@ -339,7 +359,8 @@ where } // allocate space for parity shards if haven't - for shard in self.shards.iter_mut().skip(self.num_data_shards) { + for shard in self.shards.iter_mut().skip(self.num_data_shards as usize) + { if shard.is_none() { *shard = Some(BytesMut::zeroed(self.shard_len)); } @@ -473,23 +494,23 @@ struct ShardsReader<'a> { shards: &'a Vec>, /// Number of data shards in vec. - num_data_shards: usize, + num_data_shards: u8, /// Length in bytes of a shard. shard_len: usize, /// Composite cursor: (shard_idx, byte_idx). - cursor: (usize, usize), + cursor: (u8, usize), } impl<'a> ShardsReader<'a> { /// Creates a new temporary reader. 
fn new( shards: &'a Vec>, - num_data_shards: usize, + num_data_shards: u8, shard_len: usize, ) -> Result { - for shard in shards.iter().take(num_data_shards) { + for shard in shards.iter().take(num_data_shards as usize) { if shard.is_none() { return Err(SummersetError("some data shard is None".into())); } @@ -510,8 +531,9 @@ impl<'a> io::Read for ShardsReader<'a> { let mut total_nread = 0; while self.cursor.0 < self.num_data_shards { - let mut slice = &(self.shards[self.cursor.0].as_ref().unwrap()) - [self.cursor.1..]; + let mut slice = &(self.shards[self.cursor.0 as usize] + .as_ref() + .unwrap())[self.cursor.1..]; let (_, buf_tail) = buf.split_at_mut(total_nread); let shard_nread = slice.read(buf_tail).unwrap(); @@ -569,7 +591,7 @@ mod rscoding_tests { assert_eq!(cw.avail_data_shards(), 3); assert_eq!(cw.avail_parity_shards(), 0); assert_eq!(cw.avail_shards(), 3); - assert_eq!(cw.avail_shards_set(), HashSet::from([0, 1, 2])); + assert_eq!(cw.avail_shards_map(), Bitmap::from(3, vec![0, 1, 2])); assert_eq!(cw.data_len(), data_len); assert_eq!(cw.shard_len(), shard_len); // valid with num_parity_shards > 0 @@ -580,7 +602,7 @@ mod rscoding_tests { assert_eq!(cw.avail_data_shards(), 3); assert_eq!(cw.avail_parity_shards(), 0); assert_eq!(cw.avail_shards(), 3); - assert_eq!(cw.avail_shards_set(), HashSet::from([0, 1, 2])); + assert_eq!(cw.avail_shards_map(), Bitmap::from(5, vec![0, 1, 2])); assert_eq!(cw.data_len(), data_len); assert_eq!(cw.shard_len(), shard_len); Ok(()) @@ -598,7 +620,7 @@ mod rscoding_tests { assert_eq!(cw.avail_data_shards(), 0); assert_eq!(cw.avail_parity_shards(), 0); assert_eq!(cw.avail_shards(), 0); - assert_eq!(cw.avail_shards_set(), HashSet::new()); + assert_eq!(cw.avail_shards_map(), Bitmap::new(5, false)); assert_eq!(cw.data_len(), 0); assert_eq!(cw.shard_len(), 0); Ok(()) @@ -609,21 +631,21 @@ mod rscoding_tests { let data = TestData("interesting_value".into()); let cwa = RSCodeword::from_data(data.clone(), 3, 2)?; // invalid subset - assert!(cwa.subset_copy(HashSet::from([0, 5]), false).is_err()); + assert!(cwa.subset_copy(Bitmap::from(6, vec![0, 5]), false).is_err()); // valid subsets - let cw01 = cwa.subset_copy(HashSet::from([0, 1]), false)?; + let cw01 = cwa.subset_copy(Bitmap::from(5, vec![0, 1]), false)?; assert_eq!(cw01.avail_data_shards(), 2); - let cw02 = cwa.subset_copy(HashSet::from([0, 2]), true)?; + let cw02 = cwa.subset_copy(Bitmap::from(5, vec![0, 2]), true)?; assert_eq!(cw02.avail_data_shards(), 2); assert!(cw02.data_copy.is_some()); // valid absorbing let mut cwb = RSCodeword::::from_null(3, 2)?; cwb.absorb_other(cw02)?; assert_eq!(cwb.avail_shards(), 2); - assert_eq!(cwb.avail_shards_set(), HashSet::from([0, 2])); + assert_eq!(cwb.avail_shards_map(), Bitmap::from(5, vec![0, 2])); cwb.absorb_other(cw01)?; assert_eq!(cwb.avail_shards(), 3); - assert_eq!(cwb.avail_shards_set(), HashSet::from([0, 1, 2])); + assert_eq!(cwb.avail_shards_map(), Bitmap::from(5, vec![0, 1, 2])); assert_eq!(*cwb.get_data()?, data); // invalid absorbing assert!(cwb From 7312afbb471e865e31a40e3b306a1bbfc2a224ac Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Tue, 29 Aug 2023 20:51:29 +0800 Subject: [PATCH 06/89] fix crossword ack pattern bug --- src/protocols/crossword.rs | 118 +++++++++++++++++++++++++++++-------- 1 file changed, 93 insertions(+), 25 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 7f6bc743..da59f976 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -89,8 +89,9 @@ struct LeaderBookkeeping { 
/// Max ballot among received Prepare replies. prepare_max_bal: Ballot, - /// Replicas from which I have received Accept confirmations. - accept_acks: Bitmap, + /// Replicas and their assigned shards which the received Accept + /// confirmations cover. + accept_acks: HashMap, } /// Follower-side bookkeeping info for each instance received. @@ -284,9 +285,51 @@ impl CrosswordReplica { id: ReplicaId, population: u8, num_shards: u8, - ) -> Bitmap { - let ones = (id..(id + num_shards)).map(|i| (i % population)).collect(); - Bitmap::from(population, ones) + ) -> Vec { + (id..(id + num_shards)).map(|i| (i % population)).collect() + } + + /// TODO: make better impl of this. + fn coverage_under_faults( + population: u8, + acks: &HashMap, + fault_tolerance: u8, + ) -> u8 { + if acks.len() <= fault_tolerance as usize { + return 0; + } + + // enumerate all subsets of acks excluding fault number of replicas + let cnt = (acks.len() - fault_tolerance as usize) as u32; + let servers: Vec = acks.keys().cloned().collect(); + let mut min_coverage = population; + + for n in (0..2usize.pow(servers.len() as u32)) + .filter(|n| n.count_ones() == cnt) + { + let mut coverage = Bitmap::new(population, false); + for (_, server) in servers + .iter() + .enumerate() + .filter(|&(i, _)| (n >> i) % 2 == 1) + { + for shard in acks[server].iter().filter_map(|(s, flag)| { + if flag { + Some(s) + } else { + None + } + }) { + coverage.set(shard, true).expect("impossible shard index"); + } + } + + if coverage.count() < min_coverage { + min_coverage = coverage.count(); + } + } + + min_coverage } /// Handler of client request batch chan recv. @@ -344,7 +387,7 @@ impl CrosswordReplica { old_inst.leader_bk = Some(LeaderBookkeeping { prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: Bitmap::new(self.population, false), + accept_acks: HashMap::new(), }); } else { let new_inst = Instance { @@ -354,7 +397,7 @@ impl CrosswordReplica { leader_bk: Some(LeaderBookkeeping { prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, - accept_acks: Bitmap::new(self.population, false), + accept_acks: HashMap::new(), }), replica_bk: None, }; @@ -422,10 +465,13 @@ impl CrosswordReplica { ballot: inst.bal, // persist only some shards on myself reqs_cw: inst.reqs_cw.subset_copy( - Self::shards_for_replica( - self.id, + Bitmap::from( self.population, - self.config.shards_per_replica, + Self::shards_for_replica( + self.id, + self.population, + self.config.shards_per_replica, + ), ), false, )?, @@ -447,10 +493,13 @@ impl CrosswordReplica { slot, ballot: inst.bal, reqs_cw: inst.reqs_cw.subset_copy( - Self::shards_for_replica( - peer, + Bitmap::from( self.population, - self.config.shards_per_replica, + Self::shards_for_replica( + peer, + self.population, + self.config.shards_per_replica, + ), ), false, )?, @@ -740,10 +789,13 @@ impl CrosswordReplica { slot, ballot, reqs_cw: inst.reqs_cw.subset_copy( - Self::shards_for_replica( - self.id, + Bitmap::from( self.population, - self.config.shards_per_replica, + Self::shards_for_replica( + self.id, + self.population, + self.config.shards_per_replica, + ), ), false, )?, @@ -764,10 +816,13 @@ impl CrosswordReplica { slot, ballot, reqs_cw: inst.reqs_cw.subset_copy( - Self::shards_for_replica( - peer, + Bitmap::from( self.population, - self.config.shards_per_replica, + Self::shards_for_replica( + peer, + self.population, + self.config.shards_per_replica, + ), ), false, )?, @@ -862,18 +917,31 @@ impl CrosswordReplica { assert!(self.bal_max_seen >= ballot); 
assert!(inst.leader_bk.is_some()); let leader_bk = inst.leader_bk.as_mut().unwrap(); - if leader_bk.accept_acks.get(peer)? { + if leader_bk.accept_acks.contains_key(&peer) { return Ok(()); } // bookkeep this Accept reply - leader_bk.accept_acks.set(peer, true)?; + leader_bk.accept_acks.insert( + peer, + Bitmap::from( + self.population, + Self::shards_for_replica( + peer, + self.population, + self.config.shards_per_replica, + ), + ), + ); // if quorum size reached AND enough number of shards are - // remembered, mark this instance as committed; in RS-Paxos, this - // means accept_acks.count() >= self.quorum_cnt + fault_tolerance - if leader_bk.accept_acks.count() - >= self.quorum_cnt + self.config.fault_tolerance + // remembered, mark this instance as committed + if leader_bk.accept_acks.len() as u8 >= self.quorum_cnt + && Self::coverage_under_faults( + self.population, + &leader_bk.accept_acks, + self.config.fault_tolerance, + ) >= self.quorum_cnt { inst.status = Status::Committed; pf_debug!(self.id; "committed instance at slot {} bal {}", From 6f2e0c7e5dce3cf31e1df1aaf94b65a4416f532e Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 13:25:28 +0800 Subject: [PATCH 07/89] minor updates to bench script --- scripts/local_bench.tmp.py | 39 ++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index c20ad6d5..8aaa6374 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -144,7 +144,7 @@ def all_protocol_configs(num_replicas): max_fault_tolerance = num_replicas - quorum_cnt config_choices = [("MultiPaxos", None, None)] - for shards_per_replica in range(quorum_cnt, 0): + for shards_per_replica in range(quorum_cnt, 0, -1): config_choices.append( ("Crossword", max_fault_tolerance, shards_per_replica) ) @@ -152,20 +152,27 @@ def all_protocol_configs(num_replicas): return config_choices - # for num_replicas in (3, 5, 7): - # for value_size in (1024, 65536, 4194304): - # for protocol, fault_tolerance, shards_per_replica in all_protocol_configs( - # num_replicas - # ): - # bench_round( - # protocol, - # num_replicas, - # value_size, - # 100, - # 60, - # fault_tolerance=fault_tolerance, - # shards_per_replica=shards_per_replica, - # ) + for num_replicas in (3, 5, 7): + for value_size in (1024, 65536, 4194304): + for protocol, fault_tolerance, shards_per_replica in all_protocol_configs( + num_replicas + ): + # print( + # num_replicas, + # value_size, + # protocol, + # fault_tolerance, + # shards_per_replica, + # ) + bench_round( + protocol, + num_replicas, + value_size, + 100, + 60, + fault_tolerance=fault_tolerance, + shards_per_replica=shards_per_replica, + ) bench_round("MultiPaxos", 5, 65536, 0, 60) - # bench_round("Crossword", 5, 65536, 0, 60, fault_tolerance=0, shards_per_replica=1) + bench_round("Crossword", 5, 65536, 0, 60, fault_tolerance=0, shards_per_replica=1) From 7d11298a57d319a81f863ad4d4df002ec2795bd4 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 14:03:55 +0800 Subject: [PATCH 08/89] minor updates to bench script --- scripts/local_bench.tmp.py | 2 +- scripts/local_cluster.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 8aaa6374..ba3e2346 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -19,7 +19,7 @@ def run_process(cmd): def kill_all_matching(name): # print("Kill all:", name) assert name.count(" ") == 0 - cmd = 
["pkill", "-9", "-f", name] + cmd = ["sudo", "pkill", "-9", "-f", name] proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) proc.wait() diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index ffbdff15..c4fa4f68 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -26,7 +26,7 @@ def run_process(cmd, capture_stderr=False): def kill_all_matching(name): print("Kill all:", name) assert name.count(" ") == 0 - cmd = ["pkill", "-9", "-f", name] + cmd = ["sudo", "pkill", "-9", "-f", name] proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) proc.wait() From 8353c08c5310293e55c4dad0d5bbf2e1b081124e Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 14:11:55 +0800 Subject: [PATCH 09/89] minor updates to bench script --- scripts/local_bench.tmp.py | 6 ++---- scripts/local_cluster.py | 5 ++--- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index ba3e2346..45085437 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -1,5 +1,5 @@ +import os import subprocess -import itertools import statistics @@ -19,9 +19,7 @@ def run_process(cmd): def kill_all_matching(name): # print("Kill all:", name) assert name.count(" ") == 0 - cmd = ["sudo", "pkill", "-9", "-f", name] - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - proc.wait() + os.system(f"sudo pkill -9 -f {name}") def launch_cluster(protocol, num_replicas, config): diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index c4fa4f68..f0cc1099 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -1,4 +1,5 @@ import sys +import os import argparse import subprocess from pathlib import Path @@ -26,9 +27,7 @@ def run_process(cmd, capture_stderr=False): def kill_all_matching(name): print("Kill all:", name) assert name.count(" ") == 0 - cmd = ["sudo", "pkill", "-9", "-f", name] - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - proc.wait() + os.system(f"sudo pkill -9 -f {name}") MANAGER_SRV_PORT = 52600 From b2621cbe65f7fc0395c261a99ce39306d3ea3bad Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 14:28:58 +0800 Subject: [PATCH 10/89] fixing scripts address already in use --- scripts/local_bench.tmp.py | 2 +- scripts/local_cluster.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 45085437..3776079f 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -19,7 +19,7 @@ def run_process(cmd): def kill_all_matching(name): # print("Kill all:", name) assert name.count(" ") == 0 - os.system(f"sudo pkill -9 -f {name}") + os.system(f"killall -9 {name} > /dev/null 2>&1") def launch_cluster(protocol, num_replicas, config): diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index f0cc1099..87fd94e3 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -27,7 +27,7 @@ def run_process(cmd, capture_stderr=False): def kill_all_matching(name): print("Kill all:", name) assert name.count(" ") == 0 - os.system(f"sudo pkill -9 -f {name}") + os.system(f"killall -9 {name} > /dev/null 2>&1") MANAGER_SRV_PORT = 52600 From 0a79799d8b2b80f0c160b084602cd03aaf581230 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 14:54:13 +0800 Subject: [PATCH 11/89] fixing scripts address already in use --- scripts/local_bench.tmp.py | 5 ++++- src/manager/reactor.rs 
| 6 ++++-- src/manager/reigner.rs | 6 ++++-- src/server/external.rs | 6 ++++-- src/server/transport.rs | 6 ++++-- src/utils/mod.rs | 2 +- src/utils/safetcp.rs | 28 +++++++++++++++++++++++++--- 7 files changed, 46 insertions(+), 13 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 3776079f..ad51e517 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -108,7 +108,10 @@ def bench_round( shards_per_replica=None, ): print( - f"{protocol:<10s} n={num_replicas:1d} v={value_size:<9d} w%={put_ratio:<3d} {length_s:3d}s" + f"{protocol:<10s} n={num_replicas:1d} v={value_size:<9d} " + + f"f={fault_tolerance if fault_tolerance is not None else 'x':1d} " + + f"s={shards_per_replica if shards_per_replica is not None else 'x':1d} " + + f"w%={put_ratio:<3d} {length_s:3d}s" ) kill_all_matching("summerset_client") kill_all_matching("summerset_server") diff --git a/src/manager/reactor.rs b/src/manager/reactor.rs index 52b14f63..e3a1b198 100644 --- a/src/manager/reactor.rs +++ b/src/manager/reactor.rs @@ -3,7 +3,9 @@ use std::collections::HashMap; use std::net::SocketAddr; -use crate::utils::{SummersetError, safe_tcp_read, safe_tcp_write}; +use crate::utils::{ + SummersetError, safe_tcp_read, safe_tcp_write, tcp_bind_with_retry, +}; use crate::server::ReplicaId; use crate::client::ClientId; @@ -74,7 +76,7 @@ impl ClientReactor { let (client_responder_handles_write, client_responder_handles_read) = flashmap::new::>(); - let client_listener = TcpListener::bind(cli_addr).await?; + let client_listener = tcp_bind_with_retry(cli_addr, 10).await?; let client_acceptor_handle = tokio::spawn(Self::client_acceptor_thread( tx_req, diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index 21658f92..cff8f18f 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -3,7 +3,9 @@ use std::collections::HashMap; use std::net::SocketAddr; -use crate::utils::{SummersetError, safe_tcp_read, safe_tcp_write}; +use crate::utils::{ + SummersetError, safe_tcp_read, safe_tcp_write, tcp_bind_with_retry, +}; use crate::server::ReplicaId; use crate::protocols::SmrProtocol; @@ -71,7 +73,7 @@ impl ServerReigner { let (server_controller_handles_write, server_controller_handles_read) = flashmap::new::>(); - let server_listener = TcpListener::bind(srv_addr).await?; + let server_listener = tcp_bind_with_retry(srv_addr, 10).await?; let server_acceptor_handle = tokio::spawn(Self::server_acceptor_thread( tx_recv, diff --git a/src/server/external.rs b/src/server/external.rs index 9c50b546..3083a662 100644 --- a/src/server/external.rs +++ b/src/server/external.rs @@ -3,7 +3,9 @@ use std::net::SocketAddr; use std::sync::Arc; -use crate::utils::{SummersetError, safe_tcp_read, safe_tcp_write}; +use crate::utils::{ + SummersetError, safe_tcp_read, safe_tcp_write, tcp_bind_with_retry, +}; use crate::server::{ReplicaId, Command, CommandResult}; use crate::client::ClientId; @@ -115,7 +117,7 @@ impl ExternalApi { let (client_servant_handles_write, client_servant_handles_read) = flashmap::new::>(); - let client_listener = TcpListener::bind(api_addr).await?; + let client_listener = tcp_bind_with_retry(api_addr, 10).await?; let client_acceptor_handle = tokio::spawn(Self::client_acceptor_thread( me, diff --git a/src/server/transport.rs b/src/server/transport.rs index 10deff8b..ca121d70 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -3,7 +3,9 @@ use std::fmt; use std::net::SocketAddr; -use crate::utils::{SummersetError, Bitmap, safe_tcp_read, safe_tcp_write}; 
+use crate::utils::{ + SummersetError, Bitmap, safe_tcp_read, safe_tcp_write, tcp_bind_with_retry, +}; use crate::server::ReplicaId; use bytes::BytesMut; @@ -84,7 +86,7 @@ where let (tx_connect, rx_connect) = mpsc::unbounded_channel(); let (tx_connack, rx_connack) = mpsc::unbounded_channel(); - let peer_listener = TcpListener::bind(p2p_addr).await?; + let peer_listener = tcp_bind_with_retry(p2p_addr, 10).await?; let peer_acceptor_handle = tokio::spawn(Self::peer_acceptor_thread( me, tx_recv.clone(), diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 7510b772..23a43006 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -15,5 +15,5 @@ mod rscoding; pub use error::SummersetError; pub use bitmap::Bitmap; pub use timer::Timer; -pub use safetcp::{safe_tcp_read, safe_tcp_write}; +pub use safetcp::{safe_tcp_read, safe_tcp_write, tcp_bind_with_retry}; pub use rscoding::RSCodeword; diff --git a/src/utils/safetcp.rs b/src/utils/safetcp.rs index 2e3cad88..6a0df26a 100644 --- a/src/utils/safetcp.rs +++ b/src/utils/safetcp.rs @@ -1,7 +1,9 @@ //! Safe TCP read/write helpers that provides cancellation safety on the read -//! side and deadlock avoidance on the write side. +//! side and deadlock avoidance on the write side. Safe `TcpListener` binding +//! wrapper that provides a retrying logic. use std::io::ErrorKind; +use std::net::SocketAddr; use crate::utils::SummersetError; @@ -13,7 +15,8 @@ use rmp_serde::encode::to_vec as encode_to_vec; use rmp_serde::decode::from_read as decode_from_read; use tokio::io::AsyncReadExt; -use tokio::net::TcpStream; +use tokio::net::{TcpStream, TcpListener}; +use tokio::time::{self, Duration}; /// Receives an object of type `T` from TCP readable connection `conn_read`, /// using `read_buf` as buffer storage for partial reads. Returns: @@ -140,4 +143,23 @@ where Ok(true) } -// No unit tests for these two helpers... +/// Wrapper over tokio `TcpListener::bind()` that provides a retrying logic. +pub async fn tcp_bind_with_retry( + addr: SocketAddr, + mut retries: u8, +) -> Result { + loop { + match TcpListener::bind(addr).await { + Ok(listener) => return Ok(listener), + Err(e) => { + if retries == 0 { + return Err(e.into()); + } + retries -= 1; + time::sleep(Duration::from_secs(1)).await; + } + } + } +} + +// No unit tests for these helpers... 
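
(Illustrative sketch, not part of the patch series: the `tcp_bind_with_retry` wrapper introduced in the patch above works around the "address already in use" failures the local scripts hit when a previous run's listener port is still occupied. Below is a minimal standalone re-sketch of the same retry-on-bind idea; it re-declares the helper with a plain `io::Result` instead of the crate's `SummersetError`, and the port number, timings, and driver are hypothetical, chosen only for demonstration.)

// sketch: retry TcpListener::bind until the address frees up
use std::net::SocketAddr;
use tokio::net::TcpListener;
use tokio::time::{self, Duration};

async fn tcp_bind_with_retry(
    addr: SocketAddr,
    mut retries: u8,
) -> std::io::Result<TcpListener> {
    loop {
        match TcpListener::bind(addr).await {
            Ok(listener) => return Ok(listener),
            Err(e) => {
                if retries == 0 {
                    return Err(e);
                }
                retries -= 1;
                time::sleep(Duration::from_secs(1)).await;
            }
        }
    }
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    // hypothetical local port chosen for the demo only
    let addr: SocketAddr = "127.0.0.1:52700".parse().unwrap();

    // occupy the port, then release it after two seconds
    let first = TcpListener::bind(addr).await?;
    tokio::spawn(async move {
        time::sleep(Duration::from_secs(2)).await;
        drop(first);
    });

    // a bare bind here would fail with "address already in use";
    // with retries, this succeeds once the first listener is dropped
    let listener = tcp_bind_with_retry(addr, 10).await?;
    println!("bound to {}", listener.local_addr()?);
    Ok(())
}

(One retry per second for up to ten attempts mirrors the values the patch passes at its call sites; a fixed short sleep keeps local cluster restarts quick, though a caller could back off exponentially instead.)
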
From aec9b00d5f03c9dea0ae6dce87a9ee18539db9e6 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 15:01:37 +0800 Subject: [PATCH 12/89] fixing scripts address already in use --- scripts/local_bench.tmp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index ad51e517..96a03bf4 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -109,8 +109,8 @@ def bench_round( ): print( f"{protocol:<10s} n={num_replicas:1d} v={value_size:<9d} " - + f"f={fault_tolerance if fault_tolerance is not None else 'x':1d} " - + f"s={shards_per_replica if shards_per_replica is not None else 'x':1d} " + + f"f={fault_tolerance if fault_tolerance is not None else 'x':1} " + + f"s={shards_per_replica if shards_per_replica is not None else 'x':1} " + f"w%={put_ratio:<3d} {length_s:3d}s" ) kill_all_matching("summerset_client") From 9359dbe73e579f711c8f49eb9498c5abfec663f0 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 15:02:09 +0800 Subject: [PATCH 13/89] fixing scripts address already in use --- scripts/local_bench.tmp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 96a03bf4..1537f7c2 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -42,6 +42,7 @@ def wait_cluster_setup(proc, num_replicas): for line in iter(proc.stderr.readline, b""): l = line.decode() + print(l, end="") if "manager" not in l and "accepting clients" in l: replica = int(l[l.find("(") + 1 : l.find(")")]) assert not accepting_clients[replica] From 667f705cdf4fd7e6e60388cbd90fd83773bf5e89 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 17:29:30 +0800 Subject: [PATCH 14/89] add proper termination signals handling --- Cargo.lock | 22 ++++++++++++ Cargo.toml | 1 + scripts/local_bench.tmp.py | 12 ++++--- scripts/local_cluster.py | 29 ++++++++++++--- src/manager/clusman.rs | 23 ++++++++++-- src/protocols/crossword.rs | 17 ++++++++- src/protocols/multipaxos.rs | 17 ++++++++- src/protocols/rep_nothing.rs | 17 ++++++++- src/protocols/rs_paxos.rs | 17 ++++++++- src/protocols/simple_push.rs | 17 ++++++++- src/server/replica.rs | 7 ++-- src/server/transport.rs | 2 +- src/utils/error.rs | 1 + summerset_client/src/main.rs | 3 +- summerset_manager/src/main.rs | 5 +-- summerset_server/src/main.rs | 66 ++++++++++++++++++++++------------- 16 files changed, 209 insertions(+), 47 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 38874908..e9fc04f8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -230,6 +230,16 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +[[package]] +name = "ctrlc" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a011bbe2c35ce9c1f143b7af6f94f29a167beb4cd1d29e6740ce836f723120e" +dependencies = [ + "nix", + "windows-sys", +] + [[package]] name = "dirs" version = "4.0.0" @@ -607,6 +617,17 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "nix" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b" +dependencies = [ + "bitflags 1.3.2", + "cfg-if", + "libc", +] + [[package]] name = "nom" version = "5.1.3" @@ -1095,6 +1116,7 @@ version = "0.1.0" dependencies = [ "async-trait", "bytes", + "ctrlc", "fixedbitset", "flashmap", "futures", diff 
--git a/Cargo.toml b/Cargo.toml index 18f2bfe9..707f1150 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,3 +22,4 @@ serde = { version = "1.0", features = ["derive"] } toml = { version = "0.7", features = ["parse"] } log = "0.4" reed-solomon-erasure = { version = "6.0", features = ["simd-accel"] } +ctrlc = { version = "3.4", features = ["termination"] } diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 1537f7c2..03d9e9ba 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -16,10 +16,12 @@ def run_process(cmd): return proc -def kill_all_matching(name): +def kill_all_matching(name, force=False): # print("Kill all:", name) assert name.count(" ") == 0 - os.system(f"killall -9 {name} > /dev/null 2>&1") + cmd = "killall -9" if force else "killall" + cmd += f" {name} > /dev/null 2>&1" + os.system(cmd) def launch_cluster(protocol, num_replicas, config): @@ -114,9 +116,9 @@ def bench_round( + f"s={shards_per_replica if shards_per_replica is not None else 'x':1} " + f"w%={put_ratio:<3d} {length_s:3d}s" ) - kill_all_matching("summerset_client") - kill_all_matching("summerset_server") - kill_all_matching("summerset_manager") + kill_all_matching("summerset_client", force=True) + kill_all_matching("summerset_server", force=True) + kill_all_matching("summerset_manager", force=True) configs = [] if fault_tolerance is not None: diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index 87fd94e3..c4e0877c 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -1,5 +1,6 @@ import sys import os +import signal import argparse import subprocess from pathlib import Path @@ -24,10 +25,12 @@ def run_process(cmd, capture_stderr=False): return proc -def kill_all_matching(name): +def kill_all_matching(name, force=False): print("Kill all:", name) assert name.count(" ") == 0 - os.system(f"killall -9 {name} > /dev/null 2>&1") + cmd = "killall -9" if force else "killall" + cmd += f" {name} > /dev/null 2>&1" + os.system(cmd) MANAGER_SRV_PORT = 52600 @@ -155,8 +158,8 @@ def launch_servers(protocol, num_replicas, release, config): args = parser.parse_args() # kill all existing server and manager processes - kill_all_matching("summerset_server") - kill_all_matching("summerset_manager") + kill_all_matching("summerset_server", force=True) + kill_all_matching("summerset_manager", force=True) # remove all existing wal files for path in Path("/tmp").glob("summerset.*.wal"): @@ -170,11 +173,27 @@ def launch_servers(protocol, num_replicas, release, config): wait_manager_setup(manager_proc) # then launch server replicas - launch_servers(args.protocol, args.num_replicas, args.release, args.config) + server_procs = launch_servers( + args.protocol, args.num_replicas, args.release, args.config + ) + + # register termination signals handler + def kill_spawned_procs(*args): + for proc in server_procs: + proc.terminate() + for proc in server_procs: + proc.wait() + manager_proc.terminate() + + signal.signal(signal.SIGINT, kill_spawned_procs) + signal.signal(signal.SIGTERM, kill_spawned_procs) + signal.signal(signal.SIGHUP, kill_spawned_procs) + # since we piped manager proc's output, re-print it out for line in iter(manager_proc.stderr.readline, b""): sys.stderr.buffer.write(line) sys.stderr.flush() + # reaches here after manager proc has terminated rc = manager_proc.wait() sys.exit(rc) diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index 15c7372e..e0c6f842 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -11,6 +11,8 @@ use 
crate::server::ReplicaId; use crate::client::ClientId; use crate::protocols::SmrProtocol; +use tokio::sync::mpsc; + /// Information about an active server. // TODO: maybe add things like leader info, etc. #[derive(Debug, Clone)] @@ -73,8 +75,17 @@ impl ClusterManager { }) } - /// Main event loop logic of the cluster manager. - pub async fn run(&mut self) { + /// Main event loop logic of the cluster manager. Breaks out of the loop + /// only upon catching termination signals to the process. + pub async fn run(&mut self) -> Result<(), SummersetError> { + // set up termination signals handler + let (tx_term, mut rx_term) = mpsc::unbounded_channel(); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("m"; "error sending to term channel: {}", e); + } + })?; + loop { tokio::select! { // receiving server control message @@ -102,8 +113,16 @@ impl ClusterManager { client, e); } }, + + // receiving termination signal + _ = rx_term.recv() => { + pf_warn!("m"; "manager caught termination signal"); + break; + } } } + + Ok(()) } } diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index da59f976..db359123 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -22,6 +22,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; +use tokio::sync::mpsc; use reed_solomon_erasure::galois_8::ReedSolomon; @@ -1215,7 +1216,15 @@ impl GenericReplica for CrosswordReplica { }) } - async fn run(&mut self) { + async fn run(&mut self) -> Result { + // set up termination signals handler + let (tx_term, mut rx_term) = mpsc::unbounded_channel(); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("s"; "error sending to term channel: {}", e); + } + })?; + // TODO: proper leader election if self.id == 0 { self.is_leader = true; @@ -1282,6 +1291,12 @@ impl GenericReplica for CrosswordReplica { if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { pf_error!(self.id; "error handling ctrl msg: {}", e); } + }, + + // receiving termination signal + _ = rx_term.recv() => { + pf_warn!(self.id; "server caught termination signal"); + return Ok(false); } } } diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index d44056f1..5ed71b3d 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -26,6 +26,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; +use tokio::sync::mpsc; /// Configuration parameters struct. 
#[derive(Debug, Deserialize)] @@ -1010,7 +1011,15 @@ impl GenericReplica for MultiPaxosReplica { }) } - async fn run(&mut self) { + async fn run(&mut self) -> Result { + // set up termination signals handler + let (tx_term, mut rx_term) = mpsc::unbounded_channel(); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("s"; "error sending to term channel: {}", e); + } + })?; + // TODO: proper leader election if self.id == 0 { self.is_leader = true; @@ -1077,6 +1086,12 @@ impl GenericReplica for MultiPaxosReplica { if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { pf_error!(self.id; "error handling ctrl msg: {}", e); } + }, + + // receiving termination signal + _ = rx_term.recv() => { + pf_warn!(self.id; "server caught termination signal"); + return Ok(false); } } } diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index b071253b..f4ea852c 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -21,6 +21,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; +use tokio::sync::mpsc; /// Configuration parameters struct. #[derive(Debug, Deserialize)] @@ -295,7 +296,15 @@ impl GenericReplica for RepNothingReplica { }) } - async fn run(&mut self) { + async fn run(&mut self) -> Result { + // set up termination signals handler + let (tx_term, mut rx_term) = mpsc::unbounded_channel(); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("s"; "error sending to term channel: {}", e); + } + })?; + loop { tokio::select! { // client request batch @@ -344,6 +353,12 @@ impl GenericReplica for RepNothingReplica { if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { pf_error!(self.id; "error handling ctrl msg: {}", e); } + }, + + // receiving termination signal + _ = rx_term.recv() => { + pf_warn!(self.id; "server caught termination signal"); + return Ok(false); } } } diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index b2da668d..ce3ccc85 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -22,6 +22,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; +use tokio::sync::mpsc; use reed_solomon_erasure::galois_8::ReedSolomon; @@ -1108,7 +1109,15 @@ impl GenericReplica for RSPaxosReplica { }) } - async fn run(&mut self) { + async fn run(&mut self) -> Result { + // set up termination signals handler + let (tx_term, mut rx_term) = mpsc::unbounded_channel(); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("s"; "error sending to term channel: {}", e); + } + })?; + // TODO: proper leader election if self.id == 0 { self.is_leader = true; @@ -1175,6 +1184,12 @@ impl GenericReplica for RSPaxosReplica { if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { pf_error!(self.id; "error handling ctrl msg: {}", e); } + }, + + // receiving termination signal + _ = rx_term.recv() => { + pf_warn!(self.id; "server caught termination signal"); + return Ok(false); } } } diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index eb082de3..841d8c20 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -22,6 +22,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; +use tokio::sync::mpsc; /// Configuration parameters struct. 
#[derive(Debug, Deserialize)] @@ -470,7 +471,15 @@ impl GenericReplica for SimplePushReplica { }) } - async fn run(&mut self) { + async fn run(&mut self) -> Result { + // set up termination signals handler + let (tx_term, mut rx_term) = mpsc::unbounded_channel(); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("s"; "error sending to term channel: {}", e); + } + })?; + loop { tokio::select! { // client request batch @@ -541,6 +550,12 @@ impl GenericReplica for SimplePushReplica { if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { pf_error!(self.id; "error handling ctrl msg: {}", e); } + }, + + // receiving termination signal + _ = rx_term.recv() => { + pf_warn!(self.id; "server caught termination signal"); + return Ok(false); } } } diff --git a/src/server/replica.rs b/src/server/replica.rs index 174e60a2..cae9d042 100644 --- a/src/server/replica.rs +++ b/src/server/replica.rs @@ -24,6 +24,9 @@ pub trait GenericReplica { where Self: Sized; - /// Main event loop logic of running this replica. - async fn run(&mut self); + /// Main event loop logic of running this replica. Returns `Ok(true)` if + /// terminated normally and wants to restart (e.g., receiving a reset + /// control message) or `Ok(false)` if terminated normally and does not + /// want to restart (e.g., receiving a termination signal). + async fn run(&mut self) -> Result; } diff --git a/src/server/transport.rs b/src/server/transport.rs index ca121d70..504e32a2 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -370,7 +370,7 @@ where to_connect = rx_connect.recv() => { if to_connect.is_none() { pf_error!(me; "connect channel closed"); - continue; + break; // channel gets closed and no messages remain } let (peer, addr) = to_connect.unwrap(); if let Err(e) = Self::connect_new_peer( diff --git a/src/utils/error.rs b/src/utils/error.rs index cdef3b56..90e576c5 100644 --- a/src/utils/error.rs +++ b/src/utils/error.rs @@ -45,6 +45,7 @@ impl_from_error!( tokio::sync::mpsc::error::SendError<(ReplicaId, net::SocketAddr)> ); impl_from_error!(reed_solomon_erasure::Error); +impl_from_error!(ctrlc::Error); #[cfg(test)] mod error_tests { diff --git a/summerset_client/src/main.rs b/summerset_client/src/main.rs index bba5eae3..26346720 100644 --- a/summerset_client/src/main.rs +++ b/summerset_client/src/main.rs @@ -10,7 +10,7 @@ use env_logger::Env; use tokio::runtime::Builder; use tokio::time::Duration; -use summerset::{SmrProtocol, SummersetError, pf_error}; +use summerset::{SmrProtocol, SummersetError, pf_warn, pf_error}; mod drivers; mod clients; @@ -160,6 +160,7 @@ fn main() -> ExitCode { pf_error!("c"; "client_main exitted: {}", e); ExitCode::FAILURE } else { + pf_warn!("c"; "client_main exitted successfully"); ExitCode::SUCCESS } } diff --git a/summerset_manager/src/main.rs b/summerset_manager/src/main.rs index 87fc1d61..6b886372 100644 --- a/summerset_manager/src/main.rs +++ b/summerset_manager/src/main.rs @@ -9,7 +9,7 @@ use env_logger::Env; use tokio::runtime::Builder; -use summerset::{SmrProtocol, SummersetError, pf_error}; +use summerset::{SmrProtocol, SummersetError, pf_warn, pf_error}; /// Command line arguments definition. 
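The `impl_from_error!(ctrlc::Error)` line added to `src/utils/error.rs` is what lets `?` be applied to `ctrlc::set_handler(...)` inside functions returning `Result<_, SummersetError>`. The macro body is outside this excerpt; assuming `SummersetError` is a newtype over a `String` (as the `SummersetError("...".into())` constructions later in this series suggest), the generated impl is presumably equivalent to the hand-written sketch below:

    // stand-in definition for illustration only; the real type lives in
    // src/utils/error.rs
    pub struct SummersetError(pub String);

    // hypothetical expansion of `impl_from_error!(ctrlc::Error)`: stringify
    // the source error and wrap it in the crate-wide error type
    impl From<ctrlc::Error> for SummersetError {
        fn from(e: ctrlc::Error) -> Self {
            SummersetError(e.to_string())
        }
    }
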
#[derive(Parser, Debug)] @@ -113,7 +113,7 @@ fn manager_main() -> Result<(), SummersetError> { .new_cluster_manager_setup(srv_addr, cli_addr, args.population) .await?; - manager.run().await; + manager.run().await?; Ok::<(), SummersetError>(()) // give type hint for this async closure }) @@ -130,6 +130,7 @@ fn main() -> ExitCode { pf_error!("m"; "manager_main exitted: {}", e); ExitCode::FAILURE } else { + pf_warn!("m"; "manager_main exitted successfully"); ExitCode::SUCCESS } } diff --git a/summerset_server/src/main.rs b/summerset_server/src/main.rs index df3a736b..abbbc20d 100644 --- a/summerset_server/src/main.rs +++ b/summerset_server/src/main.rs @@ -2,6 +2,8 @@ use std::net::SocketAddr; use std::process::ExitCode; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; use clap::Parser; @@ -9,7 +11,7 @@ use env_logger::Env; use tokio::runtime::Builder; -use summerset::{SmrProtocol, SummersetError, pf_error}; +use summerset::{SmrProtocol, SummersetError, pf_warn, pf_error}; /// Command line arguments definition. #[derive(Parser, Debug)] @@ -107,29 +109,44 @@ fn server_main() -> Result<(), SummersetError> { Some(&args.config[..]) }; - // create tokio multi-threaded runtime - let runtime = Builder::new_multi_thread() - .enable_all() - .worker_threads(args.threads) - .thread_name("tokio-worker-replica") - .build()?; - - // enter tokio runtime, setup the server replica, and start the main event - // loop logic - runtime.block_on(async move { - let mut replica = protocol - .new_server_replica_setup( - api_addr, - p2p_addr, - args.manager, - config_str, - ) - .await?; - - replica.run().await; - - Ok::<(), SummersetError>(()) // give type hint for this async closure - }) + let shutdown = Arc::new(AtomicBool::new(false)); + while !shutdown.load(Ordering::SeqCst) { + let sd = shutdown.clone(); + + // create tokio multi-threaded runtime + let runtime = Builder::new_multi_thread() + .enable_all() + .worker_threads(args.threads) + .thread_name("tokio-worker-replica") + .build()?; + + // enter tokio runtime, setup the server replica, and start the main + // event loop logic + runtime.block_on(async move { + let mut replica = protocol + .new_server_replica_setup( + api_addr, + p2p_addr, + args.manager, + config_str, + ) + .await?; + + if replica.run().await? 
{ + // event loop terminated but wants to restart (e.g., when + // receiving a reset control message); just drop this runtime + // and move to the next iteration of loop + } else { + // event loop terminated and does not want to restart (e.g., + // when receiving a termination signal) + sd.store(true, Ordering::SeqCst); + } + + Ok::<(), SummersetError>(()) // give type hint for this async closure + })?; + } + + Ok(()) } fn main() -> ExitCode { @@ -143,6 +160,7 @@ fn main() -> ExitCode { pf_error!("s"; "server_main exitted: {}", e); ExitCode::FAILURE } else { + pf_warn!("s"; "server_main exitted successfully"); ExitCode::SUCCESS } } From f7d71d45aafc69bee22a11ad6d4b5ae10c5cfe51 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 17:33:21 +0800 Subject: [PATCH 15/89] fixing scripts address already in use --- scripts/local_bench.tmp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 03d9e9ba..a76b77ea 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -44,7 +44,7 @@ def wait_cluster_setup(proc, num_replicas): for line in iter(proc.stderr.readline, b""): l = line.decode() - print(l, end="") + # print(l, end="") if "manager" not in l and "accepting clients" in l: replica = int(l[l.find("(") + 1 : l.find(")")]) assert not accepting_clients[replica] From edbd4f5386a30d5eb0fa63c51632e840be71cd72 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 18:46:26 +0800 Subject: [PATCH 16/89] fix wrong NewServerJoin message send timing --- src/manager/clusman.rs | 3 ++- src/manager/reigner.rs | 15 ++++++++++-- src/protocols/crossword.rs | 45 ++++++++++++++++++++---------------- src/protocols/multipaxos.rs | 43 +++++++++++++++++++--------------- src/protocols/rep_nothing.rs | 26 ++++++++++++--------- src/protocols/rs_paxos.rs | 43 +++++++++++++++++++--------------- src/protocols/simple_push.rs | 43 +++++++++++++++++++--------------- src/server/control.rs | 9 ++++++-- 8 files changed, 134 insertions(+), 93 deletions(-) diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index e0c6f842..8890faa9 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -61,7 +61,8 @@ impl ClusterManager { return logged_err!("m"; "invalid population {}", population); } - let server_reigner = ServerReigner::new_and_setup(srv_addr).await?; + let server_reigner = + ServerReigner::new_and_setup(srv_addr, population).await?; let client_reactor = ClientReactor::new_and_setup(cli_addr).await?; Ok(ClusterManager { diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index cff8f18f..ef7d7579 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -64,6 +64,7 @@ impl ServerReigner { /// messages. pub async fn new_and_setup( srv_addr: SocketAddr, + population: u8, ) -> Result { let (tx_recv, rx_recv) = mpsc::unbounded_channel(); @@ -76,6 +77,7 @@ impl ServerReigner { let server_listener = tcp_bind_with_retry(srv_addr, 10).await?; let server_acceptor_handle = tokio::spawn(Self::server_acceptor_thread( + population, tx_recv, server_listener, tx_sends_write, @@ -128,10 +130,12 @@ impl ServerReigner { // ServerReigner server_acceptor thread implementation impl ServerReigner { /// Accepts a new server connection. 
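The reworked `server_main` above boils down to a restart loop: build a fresh Tokio runtime per incarnation, run the replica to completion, and only flip the shared shutdown flag when `run()` returns `Ok(false)`. A condensed sketch of that control flow, with `run_replica_once` as a placeholder for the `new_server_replica_setup(...)` plus `replica.run()` sequence:

    use std::sync::Arc;
    use std::sync::atomic::{AtomicBool, Ordering};

    use tokio::runtime::Builder;

    fn serve_until_shutdown() -> Result<(), Box<dyn std::error::Error>> {
        let shutdown = Arc::new(AtomicBool::new(false));
        while !shutdown.load(Ordering::SeqCst) {
            let sd = shutdown.clone();

            // a brand-new runtime per iteration, so all tasks and sockets of
            // the previous incarnation are dropped before rebooting
            let runtime = Builder::new_multi_thread().enable_all().build()?;
            runtime.block_on(async move {
                // Ok(true): replica wants a restart, e.g. after a reset
                // Ok(false): replica wants a full shutdown
                if !run_replica_once().await? {
                    sd.store(true, Ordering::SeqCst);
                }
                Ok::<(), Box<dyn std::error::Error>>(())
            })?;
        }
        Ok(())
    }

    // placeholder for setting up and running one replica incarnation
    async fn run_replica_once() -> Result<bool, Box<dyn std::error::Error>> {
        Ok(false)
    }
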
+ #[allow(clippy::too_many_arguments)] async fn accept_new_server( mut stream: TcpStream, addr: SocketAddr, id: ReplicaId, + population: u8, tx_recv: mpsc::UnboundedSender<(ReplicaId, CtrlMsg)>, tx_sends: &mut flashmap::WriteHandle< ReplicaId, @@ -143,11 +147,16 @@ impl ServerReigner { >, tx_exit: mpsc::UnboundedSender, ) -> Result<(), SummersetError> { - // send ID assignment + // first send server ID assignment if let Err(e) = stream.write_u8(id).await { return logged_err!("m"; "error assigning new server ID: {}", e); } + // then send population + if let Err(e) = stream.write_u8(population).await { + return logged_err!("m"; "error sending population: {}", e); + } + let mut tx_sends_guard = tx_sends.guard(); if let Some(sender) = tx_sends_guard.get(&id) { if sender.is_closed() { @@ -205,6 +214,7 @@ impl ServerReigner { /// Server acceptor thread function. async fn server_acceptor_thread( + population: u8, tx_recv: mpsc::UnboundedSender<(ReplicaId, CtrlMsg)>, server_listener: TcpListener, mut tx_sends: flashmap::WriteHandle< @@ -241,6 +251,7 @@ impl ServerReigner { stream, addr, next_server_id, + population, tx_recv.clone(), &mut tx_sends, &mut server_controller_handles, @@ -471,7 +482,7 @@ mod reigner_tests { }); // manager let mut reigner = - ServerReigner::new_and_setup("127.0.0.1:53600".parse()?).await?; + ServerReigner::new_and_setup("127.0.0.1:53600".parse()?, 2).await?; setup_bar.wait().await; // recv message from server 0 let (id, msg) = reigner.recv_ctrl().await?; diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index db359123..dc74afe8 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -1116,14 +1116,16 @@ impl GenericReplica for CrosswordReplica { manager: SocketAddr, config_str: Option<&str>, ) -> Result { - let config = parsed_config!(config_str => ReplicaConfigCrossword; - batch_interval_us, max_batch_size, - backer_path, logger_sync, fault_tolerance, - shards_per_replica)?; // connect to the cluster manager and get assigned a server ID let mut control_hub = ControlHub::new_and_setup(manager).await?; let id = control_hub.me; + let population = control_hub.population; + // parse protocol-specific configs + let config = parsed_config!(config_str => ReplicaConfigCrossword; + batch_interval_us, max_batch_size, + backer_path, logger_sync, fault_tolerance, + shards_per_replica)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -1132,20 +1134,31 @@ impl GenericReplica for CrosswordReplica { ); } - // ask for population number and the list of peers to proactively - // connect to + // setup state machine module + let state_machine = StateMachine::new_and_setup(id).await?; + + // setup storage hub module + let storage_hub = + StorageHub::new_and_setup(id, Path::new(&config.backer_path)) + .await?; + + // setup transport hub module + let mut transport_hub = + TransportHub::new_and_setup(id, population, p2p_addr).await?; + + // ask for the list of peers to proactively connect to. Do this after + // transport hub has been set up, so that I will be able to accept + // later peer connections control_hub.send_ctrl(CtrlMsg::NewServerJoin { id, protocol: SmrProtocol::Crossword, api_addr, p2p_addr, })?; - let (population, to_peers) = if let CtrlMsg::ConnectToPeers { - population, - to_peers, - } = control_hub.recv_ctrl().await? + let to_peers = if let CtrlMsg::ConnectToPeers { to_peers, .. } = + control_hub.recv_ctrl().await? 
{ - (population, to_peers) + to_peers } else { return logged_err!(id; "unexpected ctrl msg type received"); }; @@ -1168,15 +1181,6 @@ impl GenericReplica for CrosswordReplica { (population - quorum_cnt) as usize, )?; - let state_machine = StateMachine::new_and_setup(id).await?; - - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; - - let mut transport_hub = - TransportHub::new_and_setup(id, population, p2p_addr).await?; - // proactively connect to some peers, then wait for all population // have been connected with me for (peer, addr) in to_peers { @@ -1184,6 +1188,7 @@ impl GenericReplica for CrosswordReplica { } transport_hub.wait_for_group(population).await?; + // setup external API module, ready to take in client requests let external_api = ExternalApi::new_and_setup( id, api_addr, diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 5ed71b3d..4e50c9a3 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -931,13 +931,15 @@ impl GenericReplica for MultiPaxosReplica { manager: SocketAddr, config_str: Option<&str>, ) -> Result { - let config = parsed_config!(config_str => ReplicaConfigMultiPaxos; - batch_interval_us, max_batch_size, - backer_path, logger_sync)?; // connect to the cluster manager and get assigned a server ID let mut control_hub = ControlHub::new_and_setup(manager).await?; let id = control_hub.me; + let population = control_hub.population; + // parse protocol-specific configs + let config = parsed_config!(config_str => ReplicaConfigMultiPaxos; + batch_interval_us, max_batch_size, + backer_path, logger_sync)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -946,33 +948,35 @@ impl GenericReplica for MultiPaxosReplica { ); } - // ask for population number and the list of peers to proactively - // connect to + // setup state machine module + let state_machine = StateMachine::new_and_setup(id).await?; + + // setup storage hub module + let storage_hub = + StorageHub::new_and_setup(id, Path::new(&config.backer_path)) + .await?; + + // setup transport hub module + let mut transport_hub = + TransportHub::new_and_setup(id, population, p2p_addr).await?; + + // ask for the list of peers to proactively connect to. Do this after + // transport hub has been set up, so that I will be able to accept + // later peer connections control_hub.send_ctrl(CtrlMsg::NewServerJoin { id, protocol: SmrProtocol::MultiPaxos, api_addr, p2p_addr, })?; - let (population, to_peers) = if let CtrlMsg::ConnectToPeers { - population, - to_peers, - } = control_hub.recv_ctrl().await? + let to_peers = if let CtrlMsg::ConnectToPeers { to_peers, .. } = + control_hub.recv_ctrl().await? 
{ - (population, to_peers) + to_peers } else { return logged_err!(id; "unexpected ctrl msg type received"); }; - let state_machine = StateMachine::new_and_setup(id).await?; - - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; - - let mut transport_hub = - TransportHub::new_and_setup(id, population, p2p_addr).await?; - // proactively connect to some peers, then wait for all population // have been connected with me for (peer, addr) in to_peers { @@ -980,6 +984,7 @@ impl GenericReplica for MultiPaxosReplica { } transport_hub.wait_for_group(population).await?; + // setup external API module, ready to take in client requests let external_api = ExternalApi::new_and_setup( id, api_addr, diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index f4ea852c..bbfb79c6 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -242,13 +242,14 @@ impl GenericReplica for RepNothingReplica { manager: SocketAddr, config_str: Option<&str>, ) -> Result { - let config = parsed_config!(config_str => ReplicaConfigRepNothing; - batch_interval_us, max_batch_size, - backer_path, logger_sync)?; // connect to the cluster manager and get assigned a server ID let mut control_hub = ControlHub::new_and_setup(manager).await?; let id = control_hub.me; + // parse protocol-specific configs + let config = parsed_config!(config_str => ReplicaConfigRepNothing; + batch_interval_us, max_batch_size, + backer_path, logger_sync)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -257,6 +258,16 @@ impl GenericReplica for RepNothingReplica { ); } + // setup state machine module + let state_machine = StateMachine::new_and_setup(id).await?; + + // setup storage hub module + let storage_hub = + StorageHub::new_and_setup(id, Path::new(&config.backer_path)) + .await?; + + // TransportHub is not needed in RepNothing + // tell the manager tha I have joined control_hub.send_ctrl(CtrlMsg::NewServerJoin { id, @@ -266,14 +277,7 @@ impl GenericReplica for RepNothingReplica { })?; control_hub.recv_ctrl().await?; - let state_machine = StateMachine::new_and_setup(id).await?; - - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; - - // TransportHub is not needed in RepNothing - + // setup external API module, ready to take in client requests let external_api = ExternalApi::new_and_setup( id, api_addr, diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index ce3ccc85..1c72b41a 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -1016,13 +1016,15 @@ impl GenericReplica for RSPaxosReplica { manager: SocketAddr, config_str: Option<&str>, ) -> Result { - let config = parsed_config!(config_str => ReplicaConfigRSPaxos; - batch_interval_us, max_batch_size, - backer_path, logger_sync, fault_tolerance)?; // connect to the cluster manager and get assigned a server ID let mut control_hub = ControlHub::new_and_setup(manager).await?; let id = control_hub.me; + let population = control_hub.population; + // parse protocol-specific configs + let config = parsed_config!(config_str => ReplicaConfigRSPaxos; + batch_interval_us, max_batch_size, + backer_path, logger_sync, fault_tolerance)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -1031,20 +1033,31 @@ impl GenericReplica for RSPaxosReplica { ); } - // ask for population number and the list of peers to proactively - // connect to + // setup state machine module + let state_machine = StateMachine::new_and_setup(id).await?; + 
+ // setup storage hub module + let storage_hub = + StorageHub::new_and_setup(id, Path::new(&config.backer_path)) + .await?; + + // setup transport hub module + let mut transport_hub = + TransportHub::new_and_setup(id, population, p2p_addr).await?; + + // ask for the list of peers to proactively connect to. Do this after + // transport hub has been set up, so that I will be able to accept + // later peer connections control_hub.send_ctrl(CtrlMsg::NewServerJoin { id, protocol: SmrProtocol::RSPaxos, api_addr, p2p_addr, })?; - let (population, to_peers) = if let CtrlMsg::ConnectToPeers { - population, - to_peers, - } = control_hub.recv_ctrl().await? + let to_peers = if let CtrlMsg::ConnectToPeers { to_peers, .. } = + control_hub.recv_ctrl().await? { - (population, to_peers) + to_peers } else { return logged_err!(id; "unexpected ctrl msg type received"); }; @@ -1061,15 +1074,6 @@ impl GenericReplica for RSPaxosReplica { (population - quorum_cnt) as usize, )?; - let state_machine = StateMachine::new_and_setup(id).await?; - - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; - - let mut transport_hub = - TransportHub::new_and_setup(id, population, p2p_addr).await?; - // proactively connect to some peers, then wait for all population // have been connected with me for (peer, addr) in to_peers { @@ -1077,6 +1081,7 @@ impl GenericReplica for RSPaxosReplica { } transport_hub.wait_for_group(population).await?; + // setup external API module, ready to take in client requests let external_api = ExternalApi::new_and_setup( id, api_addr, diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index 841d8c20..b0156ad3 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -398,13 +398,15 @@ impl GenericReplica for SimplePushReplica { manager: SocketAddr, config_str: Option<&str>, ) -> Result { - let config = parsed_config!(config_str => ReplicaConfigSimplePush; - batch_interval_us, max_batch_size, - backer_path, rep_degree)?; // connect to the cluster manager and get assigned a server ID let mut control_hub = ControlHub::new_and_setup(manager).await?; let id = control_hub.me; + let population = control_hub.population; + // parse protocol-specific configs + let config = parsed_config!(config_str => ReplicaConfigSimplePush; + batch_interval_us, max_batch_size, + backer_path, rep_degree)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -413,33 +415,35 @@ impl GenericReplica for SimplePushReplica { ); } - // ask for population number and the list of peers to proactively - // connect to + // setup state machine module + let state_machine = StateMachine::new_and_setup(id).await?; + + // setup storage hub module + let storage_hub = + StorageHub::new_and_setup(id, Path::new(&config.backer_path)) + .await?; + + // setup transport hub module + let mut transport_hub = + TransportHub::new_and_setup(id, population, p2p_addr).await?; + + // ask for the list of peers to proactively connect to. Do this after + // transport hub has been set up, so that I will be able to accept + // later peer connections control_hub.send_ctrl(CtrlMsg::NewServerJoin { id, protocol: SmrProtocol::SimplePush, api_addr, p2p_addr, })?; - let (population, to_peers) = if let CtrlMsg::ConnectToPeers { - population, - to_peers, - } = control_hub.recv_ctrl().await? + let to_peers = if let CtrlMsg::ConnectToPeers { to_peers, .. } = + control_hub.recv_ctrl().await? 
{ - (population, to_peers) + to_peers } else { return logged_err!(id; "unexpected ctrl msg type received"); }; - let state_machine = StateMachine::new_and_setup(id).await?; - - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; - - let mut transport_hub = - TransportHub::new_and_setup(id, population, p2p_addr).await?; - // proactively connect to some peers, then wait for all population // have been connected with me for (peer, addr) in to_peers { @@ -447,6 +451,7 @@ impl GenericReplica for SimplePushReplica { } transport_hub.wait_for_group(population).await?; + // setup external API module, ready to take in client requests let external_api = ExternalApi::new_and_setup( id, api_addr, diff --git a/src/server/control.rs b/src/server/control.rs index 68b812fd..05627db3 100644 --- a/src/server/control.rs +++ b/src/server/control.rs @@ -19,6 +19,9 @@ pub struct ControlHub { /// My replica ID. pub me: ReplicaId, + /// Number of replicas in cluster. + pub population: u8, + /// Receiver side of the recv channel. rx_recv: mpsc::UnboundedReceiver, @@ -42,8 +45,9 @@ impl ControlHub { // connect to the cluster manager and receive my assigned server ID pf_info!("s"; "connecting to manager '{}'...", manager); let mut stream = TcpStream::connect(manager).await?; - let id = stream.read_u8().await?; // receive my server ID - pf_debug!(id; "assigned server ID: {}", id); + let id = stream.read_u8().await?; // first receive assigned server ID + let population = stream.read_u8().await?; // then receive population + pf_debug!(id; "assigned server ID: {} of {}", id, population); let (tx_recv, rx_recv) = mpsc::unbounded_channel(); let (tx_send, rx_send) = mpsc::unbounded_channel(); @@ -54,6 +58,7 @@ impl ControlHub { Ok(ControlHub { me: id, + population, rx_recv, tx_send, _control_messenger_handle: control_messenger_handle, From f1295e8dfb3dc1154cbdbb9c0ad27f0f19551cab Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 18:59:34 +0800 Subject: [PATCH 17/89] minor updates to README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3ccadae9..22eb30f3 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ git checkout -b git branch --set-upstream-to=private/main git pull private git push origin -# then, on GitHub, make a PR from branch to main +# then, on GitHub, make a squashing PR from branch to main ``` # Summerset From 056f385bb73a3dabad4e0cc643fe8ed3ca562d52 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 20:18:56 +0800 Subject: [PATCH 18/89] staging progress on reset control message --- src/manager/reactor.rs | 13 ++++++++++++- src/manager/reigner.rs | 6 +++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/manager/reactor.rs b/src/manager/reactor.rs index e3a1b198..3aba3ea2 100644 --- a/src/manager/reactor.rs +++ b/src/manager/reactor.rs @@ -1,6 +1,6 @@ //! Cluster manager client-facing reactor module implementation. -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::net::SocketAddr; use crate::utils::{ @@ -26,6 +26,14 @@ pub enum CtrlRequest { /// Query the set of active servers and their info. QueryInfo, + /// Reset the specified server(s) to initial state. + ResetServer { + /// ID of server to reset. If `None`, resets all active servers. + server: Option, + /// If false, cleans durable storage state as well. + durable: bool, + }, + /// Client leave notification. 
Leave, } @@ -38,6 +46,9 @@ pub enum CtrlReply { servers: HashMap, }, + /// Reply to server reset request. + ResetServer { servers: HashSet }, + /// Reply to client leave notification. Leave, } diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index ef7d7579..459918b2 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -21,7 +21,7 @@ use tokio::task::JoinHandle; /// Control message from/to servers. Control traffic could be bidirectional: /// some initiated by the manager and some by servers. -// TODO: add reset, pause, resume, server leave, leader change, etc. +// TODO: add pause, resume, server leave, leader change, etc. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] pub enum CtrlMsg { /// Server -> Manager: new server up, requesting a list of peers' addresses @@ -38,6 +38,10 @@ pub enum CtrlMsg { population: u8, to_peers: HashMap, }, + + /// Manager -> Server: reset to initial state. If durable is false, cleans + /// durable storage state as well. + ResetState { durable: bool }, } /// The server-facing controller API module. From bc2e22f175dc66cd3a6fbd8da212f3887474cb31 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 30 Aug 2023 21:17:32 +0800 Subject: [PATCH 19/89] add tcp_connect wrapper; better server ID assign logic --- src/client/apistub.rs | 7 ++++--- src/manager/clusman.rs | 40 +++++++++++++++++++++++++++++++++-- src/manager/reigner.rs | 46 +++++++++++++++++++++++++++-------------- src/server/control.rs | 6 ++++-- src/server/transport.rs | 3 ++- src/utils/error.rs | 4 +++- src/utils/mod.rs | 4 +++- src/utils/safetcp.rs | 23 ++++++++++++++++++--- 8 files changed, 104 insertions(+), 29 deletions(-) diff --git a/src/client/apistub.rs b/src/client/apistub.rs index 8106f7f7..ea0bb14f 100644 --- a/src/client/apistub.rs +++ b/src/client/apistub.rs @@ -2,13 +2,14 @@ use std::net::SocketAddr; -use crate::utils::{SummersetError, safe_tcp_read, safe_tcp_write}; +use crate::utils::{ + SummersetError, safe_tcp_read, safe_tcp_write, tcp_connect_with_retry, +}; use crate::server::{ApiRequest, ApiReply}; use crate::client::ClientId; use bytes::BytesMut; -use tokio::net::TcpStream; use tokio::net::tcp::{OwnedReadHalf, OwnedWriteHalf}; use tokio::io::AsyncWriteExt; @@ -40,7 +41,7 @@ impl ClientApiStub { addr: SocketAddr, ) -> Result { pf_info!(id; "connecting to server '{}'...", addr); - let mut stream = TcpStream::connect(addr).await?; + let mut stream = tcp_connect_with_retry(addr, 10).await?; stream.write_u64(id).await?; // send my client ID let (read_half, write_half) = stream.into_split(); diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index 8890faa9..9a153aa3 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -1,6 +1,6 @@ //! Summerset cluster manager oracle implementation. -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::net::SocketAddr; use crate::utils::SummersetError; @@ -41,11 +41,20 @@ pub struct ClusterManager { /// ServerReigner module. server_reigner: ServerReigner, + /// Receiver side of the server ID assignment channel. + rx_id_assign: mpsc::UnboundedReceiver<()>, + + /// Sender side of the server ID assignment result channel. + tx_id_result: mpsc::UnboundedSender<(ReplicaId, u8)>, + /// ClientReactor module. client_reactor: ClientReactor, /// Information of current active servers. server_info: HashMap, + + /// Currently assigned server IDs. 
+ assigned_ids: HashSet, } impl ClusterManager { @@ -61,8 +70,12 @@ impl ClusterManager { return logged_err!("m"; "invalid population {}", population); } + let (tx_id_assign, rx_id_assign) = mpsc::unbounded_channel(); + let (tx_id_result, rx_id_result) = mpsc::unbounded_channel(); let server_reigner = - ServerReigner::new_and_setup(srv_addr, population).await?; + ServerReigner::new_and_setup(srv_addr, tx_id_assign, rx_id_result) + .await?; + let client_reactor = ClientReactor::new_and_setup(cli_addr).await?; Ok(ClusterManager { @@ -71,11 +84,27 @@ impl ClusterManager { _cli_addr: cli_addr, population, server_reigner, + rx_id_assign, + tx_id_result, client_reactor, server_info: HashMap::new(), + assigned_ids: HashSet::new(), }) } + /// Assign the first vacant server ID to a new server. + fn assign_server_id(&mut self) -> Result<(), SummersetError> { + for id in 0..self.population { + if !self.assigned_ids.contains(&id) { + self.tx_id_result.send((id, self.population))?; + self.assigned_ids.insert(id); + return Ok(()); + } + } + + logged_err!("m"; "no server ID < population left available") + } + /// Main event loop logic of the cluster manager. Breaks out of the loop /// only upon catching termination signals to the process. pub async fn run(&mut self) -> Result<(), SummersetError> { @@ -89,6 +118,13 @@ impl ClusterManager { loop { tokio::select! { + // receiving server ID assignment request + _ = self.rx_id_assign.recv() => { + if let Err(e) = self.assign_server_id() { + pf_error!("m"; "error assigning new server ID: {}", e); + } + }, + // receiving server control message ctrl_msg = self.server_reigner.recv_ctrl() => { if let Err(e) = ctrl_msg { diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index 459918b2..05436ac0 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -64,11 +64,12 @@ pub struct ServerReigner { // ServerReigner public API implementation impl ServerReigner { /// Creates a new server-facing controller module. Spawns the server - /// acceptor thread. Creates a recv channel for buffering incoming control - /// messages. + /// acceptor thread. Creates a pair of ID assignment channels. Creates + /// a recv channel for buffering incoming control messages. 
pub async fn new_and_setup( srv_addr: SocketAddr, - population: u8, + tx_id_assign: mpsc::UnboundedSender<()>, + rx_id_result: mpsc::UnboundedReceiver<(ReplicaId, u8)>, ) -> Result { let (tx_recv, rx_recv) = mpsc::unbounded_channel(); @@ -81,7 +82,8 @@ impl ServerReigner { let server_listener = tcp_bind_with_retry(srv_addr, 10).await?; let server_acceptor_handle = tokio::spawn(Self::server_acceptor_thread( - population, + tx_id_assign, + rx_id_result, tx_recv, server_listener, tx_sends_write, @@ -138,8 +140,8 @@ impl ServerReigner { async fn accept_new_server( mut stream: TcpStream, addr: SocketAddr, - id: ReplicaId, - population: u8, + tx_id_assign: &mpsc::UnboundedSender<()>, + rx_id_result: &mut mpsc::UnboundedReceiver<(ReplicaId, u8)>, tx_recv: mpsc::UnboundedSender<(ReplicaId, CtrlMsg)>, tx_sends: &mut flashmap::WriteHandle< ReplicaId, @@ -151,6 +153,12 @@ impl ServerReigner { >, tx_exit: mpsc::UnboundedSender, ) -> Result<(), SummersetError> { + // communicate with the manager's main thread to get assigned server ID + tx_id_assign.send(())?; + let (id, population) = rx_id_result.recv().await.ok_or( + SummersetError("failed to get server ID assignment".into()), + )?; + // first send server ID assignment if let Err(e) = stream.write_u8(id).await { return logged_err!("m"; "error assigning new server ID: {}", e); @@ -218,7 +226,8 @@ impl ServerReigner { /// Server acceptor thread function. async fn server_acceptor_thread( - population: u8, + tx_id_assign: mpsc::UnboundedSender<()>, + mut rx_id_result: mpsc::UnboundedReceiver<(ReplicaId, u8)>, tx_recv: mpsc::UnboundedSender<(ReplicaId, CtrlMsg)>, server_listener: TcpListener, mut tx_sends: flashmap::WriteHandle< @@ -235,9 +244,6 @@ impl ServerReigner { let local_addr = server_listener.local_addr().unwrap(); pf_info!("m"; "accepting servers on '{}'", local_addr); - // maintain a monotonically increasing server ID for new servers - let mut next_server_id: ReplicaId = 0; - // create an exit mpsc channel for getting notified about termination // of server controller threads let (tx_exit, mut rx_exit) = mpsc::unbounded_channel(); @@ -254,16 +260,14 @@ impl ServerReigner { if let Err(e) = Self::accept_new_server( stream, addr, - next_server_id, - population, + &tx_id_assign, + &mut rx_id_result, tx_recv.clone(), &mut tx_sends, &mut server_controller_handles, tx_exit.clone(), ).await { pf_error!("m"; "error accepting new server: {}", e); - } else { - next_server_id += 1; } }, @@ -485,10 +489,18 @@ mod reigner_tests { Ok::<(), SummersetError>(()) }); // manager - let mut reigner = - ServerReigner::new_and_setup("127.0.0.1:53600".parse()?, 2).await?; + let (tx_id_assign, mut rx_id_assign) = mpsc::unbounded_channel(); + let (tx_id_result, rx_id_result) = mpsc::unbounded_channel(); + let mut reigner = ServerReigner::new_and_setup( + "127.0.0.1:53600".parse()?, + tx_id_assign, + rx_id_result, + ) + .await?; setup_bar.wait().await; // recv message from server 0 + rx_id_assign.recv().await; + tx_id_result.send((0, 2))?; let (id, msg) = reigner.recv_ctrl().await?; assert_eq!(id, 0); assert_eq!( @@ -509,6 +521,8 @@ mod reigner_tests { id, )?; // recv message from server 1 + rx_id_assign.recv().await; + tx_id_result.send((1, 2))?; let (id, msg) = reigner.recv_ctrl().await?; assert_eq!(id, 1); assert_eq!( diff --git a/src/server/control.rs b/src/server/control.rs index 05627db3..ef5ff794 100644 --- a/src/server/control.rs +++ b/src/server/control.rs @@ -2,7 +2,9 @@ use std::net::SocketAddr; -use crate::utils::{SummersetError, safe_tcp_read, 
safe_tcp_write}; +use crate::utils::{ + SummersetError, safe_tcp_read, safe_tcp_write, tcp_connect_with_retry, +}; use crate::manager::CtrlMsg; use crate::server::ReplicaId; @@ -44,7 +46,7 @@ impl ControlHub { ) -> Result { // connect to the cluster manager and receive my assigned server ID pf_info!("s"; "connecting to manager '{}'...", manager); - let mut stream = TcpStream::connect(manager).await?; + let mut stream = tcp_connect_with_retry(manager, 10).await?; let id = stream.read_u8().await?; // first receive assigned server ID let population = stream.read_u8().await?; // then receive population pf_debug!(id; "assigned server ID: {} of {}", id, population); diff --git a/src/server/transport.rs b/src/server/transport.rs index 504e32a2..8f5f69cf 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -5,6 +5,7 @@ use std::net::SocketAddr; use crate::utils::{ SummersetError, Bitmap, safe_tcp_read, safe_tcp_write, tcp_bind_with_retry, + tcp_connect_with_retry, }; use crate::server::ReplicaId; @@ -253,7 +254,7 @@ where tx_exit: mpsc::UnboundedSender, ) -> Result<(), SummersetError> { pf_debug!(me; "connecting to peer {} '{}'...", id, addr); - let mut stream = TcpStream::connect(addr).await?; + let mut stream = tcp_connect_with_retry(addr, 10).await?; stream.write_u8(me).await?; // send my ID let mut peer_messenger_handles_guard = peer_messenger_handles.guard(); diff --git a/src/utils/error.rs b/src/utils/error.rs index 90e576c5..0e73dccb 100644 --- a/src/utils/error.rs +++ b/src/utils/error.rs @@ -37,13 +37,15 @@ impl_from_error!(toml::ser::Error); impl_from_error!(toml::de::Error); impl_from_error!(tokio::sync::SetError); impl_from_error!(tokio::sync::SetError); -impl_from_error!(tokio::sync::mpsc::error::TryRecvError); impl_from_error!( tokio::sync::watch::error::SendError> ); +impl_from_error!(tokio::sync::mpsc::error::TryRecvError); +impl_from_error!(tokio::sync::mpsc::error::SendError<()>); impl_from_error!( tokio::sync::mpsc::error::SendError<(ReplicaId, net::SocketAddr)> ); +impl_from_error!(tokio::sync::mpsc::error::SendError<(ReplicaId, u8)>); impl_from_error!(reed_solomon_erasure::Error); impl_from_error!(ctrlc::Error); diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 23a43006..31533217 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -15,5 +15,7 @@ mod rscoding; pub use error::SummersetError; pub use bitmap::Bitmap; pub use timer::Timer; -pub use safetcp::{safe_tcp_read, safe_tcp_write, tcp_bind_with_retry}; +pub use safetcp::{ + safe_tcp_read, safe_tcp_write, tcp_bind_with_retry, tcp_connect_with_retry, +}; pub use rscoding::RSCodeword; diff --git a/src/utils/safetcp.rs b/src/utils/safetcp.rs index 6a0df26a..2c337317 100644 --- a/src/utils/safetcp.rs +++ b/src/utils/safetcp.rs @@ -1,6 +1,4 @@ -//! Safe TCP read/write helpers that provides cancellation safety on the read -//! side and deadlock avoidance on the write side. Safe `TcpListener` binding -//! wrapper that provides a retrying logic. +//! Safe TCP bind/connect/read/write helper functions. use std::io::ErrorKind; use std::net::SocketAddr; @@ -162,4 +160,23 @@ pub async fn tcp_bind_with_retry( } } +/// Wrapper over tokio `TcpStream::connect()` that provides a retrying logic. 
+pub async fn tcp_connect_with_retry( + addr: SocketAddr, + mut retries: u8, +) -> Result { + loop { + match TcpStream::connect(addr).await { + Ok(stream) => return Ok(stream), + Err(e) => { + if retries == 0 { + return Err(e.into()); + } + retries -= 1; + time::sleep(Duration::from_secs(1)).await; + } + } + } +} + // No unit tests for these helpers... From 36d0c5bc7b57625fad4f621ef2445be54752f257 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 14:02:13 +0800 Subject: [PATCH 20/89] huge updates adding server leave and reset support --- Cargo.lock | 2 + src/client/endpoint.rs | 18 +- src/lib.rs | 2 +- src/manager/clusman.rs | 84 ++++++-- src/manager/reactor.rs | 15 +- src/manager/reigner.rs | 140 ++++++++++++- src/protocols/crossword.rs | 164 ++++++++++----- src/protocols/mod.rs | 22 +- src/protocols/multipaxos.rs | 164 ++++++++++----- src/protocols/rep_nothing.rs | 161 ++++++++++----- src/protocols/rs_paxos.rs | 164 ++++++++++----- src/protocols/simple_push.rs | 164 ++++++++++----- src/server/external.rs | 15 +- src/server/replica.rs | 10 +- src/server/transport.rs | 217 ++++++++++++++++---- summerset_client/Cargo.toml | 2 +- summerset_client/src/clients/tester.rs | 28 ++- summerset_client/src/drivers/closed_loop.rs | 17 +- summerset_client/src/drivers/open_loop.rs | 16 +- summerset_client/src/main.rs | 5 +- summerset_manager/Cargo.toml | 3 +- summerset_manager/src/main.rs | 56 +++-- summerset_server/Cargo.toml | 3 +- summerset_server/src/main.rs | 30 ++- 24 files changed, 1118 insertions(+), 384 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e9fc04f8..883efd71 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1151,6 +1151,7 @@ name = "summerset_manager" version = "0.1.0" dependencies = [ "clap", + "ctrlc", "env_logger", "log", "rand", @@ -1163,6 +1164,7 @@ name = "summerset_server" version = "0.1.0" dependencies = [ "clap", + "ctrlc", "env_logger", "log", "rand", diff --git a/src/client/endpoint.rs b/src/client/endpoint.rs index bf3cd10f..37387e2b 100644 --- a/src/client/endpoint.rs +++ b/src/client/endpoint.rs @@ -5,6 +5,7 @@ use std::net::SocketAddr; use crate::utils::SummersetError; use crate::server::{ApiRequest, ApiReply}; +use crate::client::ClientCtrlStub; use async_trait::async_trait; @@ -14,8 +15,9 @@ pub type ClientId = u64; /// Client trait to be implement by all protocol-specific client structs. #[async_trait] pub trait GenericEndpoint { - /// Creates a new client stub. - fn new( + /// Creates a new client stub and sets up required functionality modules + /// according to protocol-specific logic. + async fn new_and_setup( manager: SocketAddr, // remote address of manager oracle config_str: Option<&str>, ) -> Result @@ -23,9 +25,8 @@ pub trait GenericEndpoint { Self: Sized; /// Establishes connection to the service (or re-joins the service) - /// according to protocol-specific logic. Returns the assigned client ID - /// on success. - async fn connect(&mut self) -> Result; + /// according to protocol-specific logic. + async fn connect(&mut self) -> Result<(), SummersetError>; /// Leaves the service: forgets about the current TCP connections and send /// leave notifications according to protocol-specific logic. If `permanent` @@ -40,4 +41,11 @@ pub trait GenericEndpoint { /// Receives a reply from the service according to protocol-specific logic. async fn recv_reply(&mut self) -> Result; + + /// Gets my client ID. 
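Put together, the new `tcp_connect_with_retry` helper and the extended manager handshake (ID byte followed by a population byte) give the server-side greeting the shape below. This condenses `ControlHub::new_and_setup` from this series; the function name here is illustrative, and the retry interval is the helper's fixed one second per attempt:

    use std::net::SocketAddr;

    use tokio::io::AsyncReadExt;

    use crate::utils::{tcp_connect_with_retry, SummersetError};

    async fn greet_manager(
        manager: SocketAddr,
    ) -> Result<(u8, u8), SummersetError> {
        // up to 10 attempts, one second apart, to ride out races where the
        // manager is not yet listening on its server-facing address
        let mut stream = tcp_connect_with_retry(manager, 10).await?;
        let id = stream.read_u8().await?; // first the assigned server ID
        let population = stream.read_u8().await?; // then the population
        Ok((id, population))
    }
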
+ fn id(&self) -> ClientId; + + /// Gets a mutable reference to the control stub for sending control + /// requests and receiving control replies for testing purposes. + fn ctrl_stub(&mut self) -> &mut ClientCtrlStub; } diff --git a/src/lib.rs b/src/lib.rs index 24a24bb6..2de53e51 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -25,7 +25,7 @@ pub use crate::server::{ }; #[doc(inline)] -pub use crate::client::{ClientId, GenericEndpoint}; +pub use crate::client::{ClientId, GenericEndpoint, ClientCtrlStub}; #[doc(inline)] pub use crate::protocols::SmrProtocol; diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index 9a153aa3..de18afb5 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -11,7 +11,7 @@ use crate::server::ReplicaId; use crate::client::ClientId; use crate::protocols::SmrProtocol; -use tokio::sync::mpsc; +use tokio::sync::{mpsc, watch}; /// Information about an active server. // TODO: maybe add things like leader info, etc. @@ -107,15 +107,10 @@ impl ClusterManager { /// Main event loop logic of the cluster manager. Breaks out of the loop /// only upon catching termination signals to the process. - pub async fn run(&mut self) -> Result<(), SummersetError> { - // set up termination signals handler - let (tx_term, mut rx_term) = mpsc::unbounded_channel(); - ctrlc::set_handler(move || { - if let Err(e) = tx_term.send(true) { - pf_error!("m"; "error sending to term channel: {}", e); - } - })?; - + pub async fn run( + &mut self, + mut rx_term: watch::Receiver, + ) -> Result<(), SummersetError> { loop { tokio::select! { // receiving server ID assignment request @@ -132,7 +127,7 @@ impl ClusterManager { continue; } let (server, msg) = ctrl_msg.unwrap(); - if let Err(e) = self.handle_ctrl_msg(server, msg) { + if let Err(e) = self.handle_ctrl_msg(server, msg).await { pf_error!("m"; "error handling ctrl msg <- {}: {}", server, e); } @@ -145,14 +140,14 @@ impl ClusterManager { continue; } let (client, req) = ctrl_req.unwrap(); - if let Err(e) = self.handle_ctrl_req(client, req) { + if let Err(e) = self.handle_ctrl_req(client, req).await { pf_error!("m"; "error handling ctrl req <- {}: {}", client, e); } }, // receiving termination signal - _ = rx_term.recv() => { + _ = rx_term.changed() => { pf_warn!("m"; "manager caught termination signal"); break; } @@ -203,7 +198,7 @@ impl ClusterManager { } /// Synthesized handler of server-initiated control messages. - fn handle_ctrl_msg( + async fn handle_ctrl_msg( &mut self, server: ReplicaId, msg: CtrlMsg, @@ -249,8 +244,62 @@ impl ClusterManager { .send_reply(CtrlReply::QueryInfo { servers }, client) } + /// Handler of client ResetServer request. 
+ async fn handle_client_reset_server( + &mut self, + client: ClientId, + server: Option, + durable: bool, + ) -> Result<(), SummersetError> { + let num_replicas = self.server_info.len(); + let mut servers: Vec = if server.is_none() { + // all active servers + self.server_info.keys().copied().collect() + } else { + vec![server.unwrap()] + }; + + // reset specified server(s) + let mut reset_done = HashSet::new(); + while let Some(s) = servers.pop() { + // send reset server control message to server + self.server_reigner + .send_ctrl(CtrlMsg::ResetState { durable }, s)?; + + // remove information about this server + assert!(self.assigned_ids.contains(&s)); + assert!(self.server_info.contains_key(&s)); + self.assigned_ids.remove(&s); + self.server_info.remove(&s); + + // wait for the new server ID assignment request from it + self.rx_id_assign.recv().await; + if let Err(e) = self.assign_server_id() { + return logged_err!("m"; "error assigning new server ID: {}", e); + } + + reset_done.insert(s); + } + + // now the reset servers should be sending NewServerJoin messages to + // me. Process them until all servers joined + while self.server_info.len() < num_replicas { + let (s, msg) = self.server_reigner.recv_ctrl().await?; + if let Err(e) = self.handle_ctrl_msg(s, msg).await { + pf_error!("m"; "error handling ctrl msg <- {}: {}", s, e); + } + } + + self.client_reactor.send_reply( + CtrlReply::ResetServer { + servers: reset_done, + }, + client, + ) + } + /// Synthesized handler of client-initiated control requests. - fn handle_ctrl_req( + async fn handle_ctrl_req( &mut self, client: ClientId, req: CtrlRequest, @@ -261,6 +310,11 @@ impl ClusterManager { self.handle_client_query_info(client)?; } + CtrlRequest::ResetServer { server, durable } => { + self.handle_client_reset_server(client, server, durable) + .await?; + } + _ => {} // ignore all other types } diff --git a/src/manager/reactor.rs b/src/manager/reactor.rs index 3aba3ea2..41a0582d 100644 --- a/src/manager/reactor.rs +++ b/src/manager/reactor.rs @@ -441,9 +441,11 @@ mod reactor_tests { ClientReactor::new_and_setup("127.0.0.1:53601".parse()?) .await?; barrier2.wait().await; + // recv request from client let (client, req) = reactor.recv_req().await?; assert!(reactor.has_client(client)); assert_eq!(req, CtrlRequest::QueryInfo); + // send reply to client reactor.send_reply( CtrlReply::QueryInfo { servers: HashMap::::from([ @@ -459,7 +461,9 @@ mod reactor_tests { barrier.wait().await; let mut ctrl_stub = ClientCtrlStub::new_by_connect("127.0.0.1:53601".parse()?).await?; + // send request to manager ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + // recv reply from manager assert_eq!( ctrl_stub.recv_reply().await?, CtrlReply::QueryInfo { @@ -482,7 +486,9 @@ mod reactor_tests { let mut ctrl_stub = ClientCtrlStub::new_by_connect("127.0.0.1:54601".parse()?) .await?; + // send request to manager ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + // recv reply from manager assert_eq!( ctrl_stub.recv_reply().await?, CtrlReply::QueryInfo { @@ -492,14 +498,17 @@ mod reactor_tests { ]), } ); + // leave and come back as new client ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; assert_eq!(ctrl_stub.recv_reply().await?, CtrlReply::Leave); ctrl_stub.forget(); - time::sleep(Duration::from_millis(10)).await; + time::sleep(Duration::from_millis(100)).await; let mut ctrl_stub = ClientCtrlStub::new_by_connect("127.0.0.1:54601".parse()?) 
.await?; + // send request to manager ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + // recv reply from manager assert_eq!( ctrl_stub.recv_reply().await?, CtrlReply::QueryInfo { @@ -515,9 +524,11 @@ mod reactor_tests { let mut reactor = ClientReactor::new_and_setup("127.0.0.1:54601".parse()?).await?; barrier.wait().await; + // recv request from client let (client, req) = reactor.recv_req().await?; assert!(reactor.has_client(client)); assert_eq!(req, CtrlRequest::QueryInfo); + // send reply to client reactor.send_reply( CtrlReply::QueryInfo { servers: HashMap::::from([ @@ -527,10 +538,12 @@ mod reactor_tests { }, client, )?; + // recv request from new client let (client2, req2) = reactor.recv_req().await?; assert!(reactor.has_client(client2)); assert!(!reactor.has_client(client)); assert_eq!(req2, CtrlRequest::QueryInfo); + // send reply to new client reactor.send_reply( CtrlReply::QueryInfo { servers: HashMap::::from([ diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index 05436ac0..02e4e4c3 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -21,7 +21,7 @@ use tokio::task::JoinHandle; /// Control message from/to servers. Control traffic could be bidirectional: /// some initiated by the manager and some by servers. -// TODO: add pause, resume, server leave, leader change, etc. +// TODO: add pause, resume, leader change, membership change, etc. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] pub enum CtrlMsg { /// Server -> Manager: new server up, requesting a list of peers' addresses @@ -42,6 +42,12 @@ pub enum CtrlMsg { /// Manager -> Server: reset to initial state. If durable is false, cleans /// durable storage state as well. ResetState { durable: bool }, + + /// Server -> Manager: leave notification. + Leave, + + /// Manager -> Server: dummy leave reply. + LeaveReply, } /// The server-facing controller API module. @@ -98,6 +104,13 @@ impl ServerReigner { }) } + /// Returns whether a server ID is connected to me. + #[allow(dead_code)] + pub fn has_server(&self, server: ReplicaId) -> bool { + let tx_sends_guard = self.tx_sends.guard(); + tx_sends_guard.contains_key(&server) + } + /// Waits for the next control event message from some server. 
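On the client side, the new `ResetServer` request travels over the same `ClientCtrlStub` send/receive pattern as `QueryInfo`. A sketch of how a test driver might reset replica 0 and wipe its durable state (the function name is illustrative; everything else follows the request/reply types defined above):

    use std::collections::HashSet;

    use crate::client::ClientCtrlStub;
    use crate::manager::{CtrlRequest, CtrlReply};
    use crate::server::ReplicaId;
    use crate::utils::SummersetError;

    async fn reset_replica_zero(
        ctrl_stub: &mut ClientCtrlStub,
    ) -> Result<HashSet<ReplicaId>, SummersetError> {
        let req = CtrlRequest::ResetServer {
            server: Some(0),
            durable: false, // false: also truncate the server's backer file
        };

        // keep retrying send_req(None) until the request is fully flushed,
        // mirroring the QueryInfo round trips elsewhere in this series
        let mut sent = ctrl_stub.send_req(Some(&req))?;
        while !sent {
            sent = ctrl_stub.send_req(None)?;
        }

        match ctrl_stub.recv_reply().await? {
            CtrlReply::ResetServer { servers } => Ok(servers),
            _ => Err(SummersetError("unexpected reply type received".into())),
        }
    }
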
pub async fn recv_ctrl( &mut self, @@ -359,6 +372,22 @@ impl ServerReigner { // receives control message from server msg = Self::read_ctrl(&mut read_buf, &mut conn_read) => { match msg { + Ok(CtrlMsg::Leave) => { + // server leaving, send dummy reply and break + let msg = CtrlMsg::LeaveReply; + if let Err(e) = Self::write_ctrl( + &mut write_buf, + &mut write_buf_cursor, + &conn_write, + Some(&msg) + ) { + pf_error!("m"; "error replying -> {}: {}", id, e); + } else { // skips `WouldBlock` failure check here + pf_info!("m"; "server {} has left", id); + } + break; + }, + Ok(CtrlMsg::NewServerJoin { id, protocol, @@ -380,7 +409,7 @@ impl ServerReigner { if let Err(e) = tx_recv.send((id, msg)) { pf_error!("m"; "error sending to tx_recv for {}: {}", id, e); } - } + }, Ok(msg) => { // pf_trace!("m"; "recv <- {} ctrl {:?}", id, msg); @@ -432,6 +461,7 @@ mod reigner_tests { use std::sync::Arc; use crate::server::ControlHub; use tokio::sync::Barrier; + use tokio::time::{self, Duration}; #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn api_send_recv() -> Result<(), SummersetError> { @@ -544,4 +574,110 @@ mod reigner_tests { )?; Ok(()) } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn api_server_leave() -> Result<(), SummersetError> { + let barrier = Arc::new(Barrier::new(2)); + let barrier2 = barrier.clone(); + tokio::spawn(async move { + // replica 0 + barrier2.wait().await; + let mut hub = + ControlHub::new_and_setup("127.0.0.1:54600".parse()?).await?; + assert_eq!(hub.me, 0); + // send a message to manager + hub.send_ctrl(CtrlMsg::NewServerJoin { + id: hub.me, + protocol: SmrProtocol::SimplePush, + api_addr: "127.0.0.1:54700".parse()?, + p2p_addr: "127.0.0.1:54800".parse()?, + })?; + // recv a message from manager + assert_eq!( + hub.recv_ctrl().await?, + CtrlMsg::ConnectToPeers { + population: 1, + to_peers: HashMap::new(), + } + ); + // leave and re-join as 0 + hub.send_ctrl(CtrlMsg::Leave)?; + assert_eq!(hub.recv_ctrl().await?, CtrlMsg::LeaveReply); + time::sleep(Duration::from_millis(100)).await; + let mut hub = + ControlHub::new_and_setup("127.0.0.1:54600".parse()?).await?; + assert_eq!(hub.me, 0); + // send a message to manager + hub.send_ctrl(CtrlMsg::NewServerJoin { + id: hub.me, + protocol: SmrProtocol::SimplePush, + api_addr: "127.0.0.1:54700".parse()?, + p2p_addr: "127.0.0.1:54800".parse()?, + })?; + // recv a message from manager + assert_eq!( + hub.recv_ctrl().await?, + CtrlMsg::ConnectToPeers { + population: 1, + to_peers: HashMap::new(), + } + ); + Ok::<(), SummersetError>(()) + }); + // manager + let (tx_id_assign, mut rx_id_assign) = mpsc::unbounded_channel(); + let (tx_id_result, rx_id_result) = mpsc::unbounded_channel(); + let mut reigner = ServerReigner::new_and_setup( + "127.0.0.1:54600".parse()?, + tx_id_assign, + rx_id_result, + ) + .await?; + barrier.wait().await; + // recv message from server 0 + rx_id_assign.recv().await; + tx_id_result.send((0, 1))?; + let (id, msg) = reigner.recv_ctrl().await?; + assert_eq!(id, 0); + assert_eq!( + msg, + CtrlMsg::NewServerJoin { + id: 0, + protocol: SmrProtocol::SimplePush, + api_addr: "127.0.0.1:54700".parse()?, + p2p_addr: "127.0.0.1:54800".parse()? 
+ } + ); + // send reply to server 0 + reigner.send_ctrl( + CtrlMsg::ConnectToPeers { + population: 1, + to_peers: HashMap::new(), + }, + id, + )?; + rx_id_assign.recv().await; + tx_id_result.send((0, 1))?; + // recv message from server 0 + let (id, msg) = reigner.recv_ctrl().await?; + assert_eq!(id, 0); + assert_eq!( + msg, + CtrlMsg::NewServerJoin { + id: 0, + protocol: SmrProtocol::SimplePush, + api_addr: "127.0.0.1:54700".parse()?, + p2p_addr: "127.0.0.1:54800".parse()? + } + ); + // send reply to server 0 + reigner.send_ctrl( + CtrlMsg::ConnectToPeers { + population: 1, + to_peers: HashMap::new(), + }, + id, + )?; + Ok(()) + } } diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index dc74afe8..493213d4 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -22,7 +22,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; -use tokio::sync::mpsc; +use tokio::sync::watch; use reed_solomon_erasure::galois_8::ReedSolomon; @@ -1101,11 +1101,61 @@ impl CrosswordReplica { Ok(()) } - /// Synthesized handler of manager control messages. - fn handle_ctrl_msg(&mut self, _msg: CtrlMsg) -> Result<(), SummersetError> { - // TODO: fill this when more control message types added + /// Handler of ResetState control message. + async fn handle_ctrl_reset_state( + &mut self, + durable: bool, + ) -> Result<(), SummersetError> { + // send leave notification to peers and wait for their replies + self.transport_hub.leave().await?; + + // send leave notification to manager and wait for its reply + self.control_hub.send_ctrl(CtrlMsg::Leave)?; + while self.control_hub.recv_ctrl().await? != CtrlMsg::LeaveReply {} + + // if `durable` is false, truncate backer file + if !durable { + // use 0 as a special log action ID here + self.storage_hub + .submit_action(0, LogAction::Truncate { offset: 0 })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id == 0 { + if log_result + != (LogResult::Truncate { + offset_ok: true, + now_size: 0, + }) + { + return logged_err!(self.id; "failed to truncate log to 0"); + } else { + return Ok(()); + } + } + } + } + Ok(()) } + + /// Synthesized handler of manager control messages. If ok, returns + /// `Some(true)` if decides to terminate and reboot, `Some(false)` if + /// decides to shutdown completely, and `None` if not terminating. 
+ async fn handle_ctrl_msg( + &mut self, + msg: CtrlMsg, + ) -> Result, SummersetError> { + // TODO: fill this when more control message types added + match msg { + CtrlMsg::ResetState { durable } => { + self.handle_ctrl_reset_state(durable).await?; + Ok(Some(true)) + } + + _ => Ok(None), // ignore all other types + } + } } #[async_trait] @@ -1221,15 +1271,10 @@ impl GenericReplica for CrosswordReplica { }) } - async fn run(&mut self) -> Result { - // set up termination signals handler - let (tx_term, mut rx_term) = mpsc::unbounded_channel(); - ctrlc::set_handler(move || { - if let Err(e) = tx_term.send(true) { - pf_error!("s"; "error sending to term channel: {}", e); - } - })?; - + async fn run( + &mut self, + mut rx_term: watch::Receiver, + ) -> Result { // TODO: proper leader election if self.id == 0 { self.is_leader = true; @@ -1293,19 +1338,34 @@ impl GenericReplica for CrosswordReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { - pf_error!(self.id; "error handling ctrl msg: {}", e); + match self.handle_ctrl_msg(ctrl_msg).await { + Ok(terminate) => { + if let Some(restart) = terminate { + pf_warn!( + self.id; + "server got {} req", + if restart { "restart" } else { "shutdown" }); + return Ok(restart); + } + }, + Err(e) => { + pf_error!(self.id; "error handling ctrl msg: {}", e); + } } }, // receiving termination signal - _ = rx_term.recv() => { + _ = rx_term.changed() => { pf_warn!(self.id; "server caught termination signal"); return Ok(false); } } } } + + fn id(&self) -> ReplicaId { + self.id + } } /// Configuration parameters struct. @@ -1327,9 +1387,6 @@ pub struct CrosswordClient { /// Client ID. id: ClientId, - /// Address of the cluster manager oracle. - manager: SocketAddr, - /// Configuration parameters struct. _config: ClientConfigCrossword, @@ -1340,7 +1397,7 @@ pub struct CrosswordClient { server_id: ReplicaId, /// Control API stub to the cluster manager. - ctrl_stub: Option, + ctrl_stub: ClientCtrlStub, /// API stubs for communicating with servers. 
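With this patch the replica `run()` no longer installs its own signal handler; it takes a `watch::Receiver` (presumably `watch::Receiver<bool>`, matching the `tx_term.send(true)` on the sending side) and exits when the value changes. The binaries' side of that wiring is not part of this excerpt; presumably it looks roughly like the sketch below, with the handler installed once per process since `ctrlc::set_handler` cannot be registered twice, and `eprintln!` standing in for the logging macros:

    use tokio::sync::watch;

    use crate::server::GenericReplica;
    use crate::utils::SummersetError;

    async fn drive_replica(
        replica: &mut dyn GenericReplica,
    ) -> Result<bool, SummersetError> {
        // the ctrlc handler only publishes a new value on the watch channel;
        // run() observes it through rx_term.changed() in its select loop
        let (tx_term, rx_term) = watch::channel(false);
        ctrlc::set_handler(move || {
            if let Err(e) = tx_term.send(true) {
                eprintln!("error sending to term channel: {}", e);
            }
        })?;

        // Ok(true) asks the caller to restart the replica, Ok(false) to stop
        replica.run(rx_term).await
    }
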
api_stub: Option, @@ -1348,47 +1405,43 @@ pub struct CrosswordClient { #[async_trait] impl GenericEndpoint for CrosswordClient { - fn new( + async fn new_and_setup( manager: SocketAddr, config_str: Option<&str>, ) -> Result { + // connect to the cluster manager and get assigned a client ID + let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; + let id = ctrl_stub.id; + + // parse protocol-specific configs let config = parsed_config!(config_str => ClientConfigCrossword; init_server_id)?; let init_server_id = config.init_server_id; Ok(CrosswordClient { - id: 255, // nil at this time - manager, + id, _config: config, servers: HashMap::new(), server_id: init_server_id, - ctrl_stub: None, + ctrl_stub, api_stub: None, }) } - async fn connect(&mut self) -> Result { + async fn connect(&mut self) -> Result<(), SummersetError> { // disallow reconnection without leaving if self.api_stub.is_some() { return logged_err!(self.id; "reconnecting without leaving"); } - // if ctrl_stubs not established yet, connect to the manager - if self.ctrl_stub.is_none() { - let ctrl_stub = - ClientCtrlStub::new_by_connect(self.manager).await?; - self.id = ctrl_stub.id; - self.ctrl_stub = Some(ctrl_stub); - } - let ctrl_stub = self.ctrl_stub.as_mut().unwrap(); - // ask the manager about the list of active servers - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; while !sent { - sent = ctrl_stub.send_req(None)?; + sent = self.ctrl_stub.send_req(None)?; } - let reply = ctrl_stub.recv_reply().await?; + let reply = self.ctrl_stub.recv_reply().await?; match reply { CtrlReply::QueryInfo { servers } => { // connect to the one with server ID in config @@ -1399,7 +1452,7 @@ impl GenericEndpoint for CrosswordClient { .await?; self.api_stub = Some(api_stub); self.servers = servers; - Ok(self.id) + Ok(()) } _ => logged_err!(self.id; "unexpected reply type received"), } @@ -1427,26 +1480,19 @@ impl GenericEndpoint for CrosswordClient { // if permanently leaving, send leave notification to the manager if permanent { - // disallow multiple permanent leaving - if self.ctrl_stub.is_none() { - return logged_err!(self.id; "repeated permanent leaving"); + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; + while !sent { + sent = self.ctrl_stub.send_req(None)?; } - if let Some(mut ctrl_stub) = self.ctrl_stub.take() { - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; - while !sent { - sent = ctrl_stub.send_req(None)?; + let reply = self.ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::Leave => { + pf_info!(self.id; "left current manager connection"); } - - let reply = ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - ctrl_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } + _ => { + return logged_err!(self.id; "unexpected reply type received"); } } } @@ -1492,4 +1538,12 @@ impl GenericEndpoint for CrosswordClient { None => logged_err!(self.id; "client is not set up"), } } + + fn id(&self) -> ClientId { + self.id + } + + fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + &mut self.ctrl_stub + } } diff --git a/src/protocols/mod.rs b/src/protocols/mod.rs index 98ecf371..3aae79bf 100644 --- a/src/protocols/mod.rs +++ b/src/protocols/mod.rs @@ -126,26 +126,36 @@ impl SmrProtocol { } /// Create a client endpoint instance of this protocol on heap. 
- pub fn new_client_endpoint( + pub async fn new_client_endpoint( &self, manager: SocketAddr, config_str: Option<&str>, ) -> Result, SummersetError> { match self { Self::RepNothing => { - box_if_ok!(RepNothingClient::new(manager, config_str)) + box_if_ok!( + RepNothingClient::new_and_setup(manager, config_str).await + ) } Self::SimplePush => { - box_if_ok!(SimplePushClient::new(manager, config_str)) + box_if_ok!( + SimplePushClient::new_and_setup(manager, config_str).await + ) } Self::MultiPaxos => { - box_if_ok!(MultiPaxosClient::new(manager, config_str)) + box_if_ok!( + MultiPaxosClient::new_and_setup(manager, config_str).await + ) } Self::RSPaxos => { - box_if_ok!(RSPaxosClient::new(manager, config_str)) + box_if_ok!( + RSPaxosClient::new_and_setup(manager, config_str).await + ) } Self::Crossword => { - box_if_ok!(CrosswordClient::new(manager, config_str)) + box_if_ok!( + CrosswordClient::new_and_setup(manager, config_str).await + ) } } } diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 4e50c9a3..2431ff86 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -26,7 +26,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; -use tokio::sync::mpsc; +use tokio::sync::watch; /// Configuration parameters struct. #[derive(Debug, Deserialize)] @@ -916,11 +916,61 @@ impl MultiPaxosReplica { Ok(()) } - /// Synthesized handler of manager control messages. - fn handle_ctrl_msg(&mut self, _msg: CtrlMsg) -> Result<(), SummersetError> { - // TODO: fill this when more control message types added + /// Handler of ResetState control message. + async fn handle_ctrl_reset_state( + &mut self, + durable: bool, + ) -> Result<(), SummersetError> { + // send leave notification to peers and wait for their replies + self.transport_hub.leave().await?; + + // send leave notification to manager and wait for its reply + self.control_hub.send_ctrl(CtrlMsg::Leave)?; + while self.control_hub.recv_ctrl().await? != CtrlMsg::LeaveReply {} + + // if `durable` is false, truncate backer file + if !durable { + // use 0 as a special log action ID here + self.storage_hub + .submit_action(0, LogAction::Truncate { offset: 0 })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id == 0 { + if log_result + != (LogResult::Truncate { + offset_ok: true, + now_size: 0, + }) + { + return logged_err!(self.id; "failed to truncate log to 0"); + } else { + return Ok(()); + } + } + } + } + Ok(()) } + + /// Synthesized handler of manager control messages. If ok, returns + /// `Some(true)` if decides to terminate and reboot, `Some(false)` if + /// decides to shutdown completely, and `None` if not terminating. 
+ async fn handle_ctrl_msg( + &mut self, + msg: CtrlMsg, + ) -> Result, SummersetError> { + // TODO: fill this when more control message types added + match msg { + CtrlMsg::ResetState { durable } => { + self.handle_ctrl_reset_state(durable).await?; + Ok(Some(true)) + } + + _ => Ok(None), // ignore all other types + } + } } #[async_trait] @@ -1016,15 +1066,10 @@ impl GenericReplica for MultiPaxosReplica { }) } - async fn run(&mut self) -> Result { - // set up termination signals handler - let (tx_term, mut rx_term) = mpsc::unbounded_channel(); - ctrlc::set_handler(move || { - if let Err(e) = tx_term.send(true) { - pf_error!("s"; "error sending to term channel: {}", e); - } - })?; - + async fn run( + &mut self, + mut rx_term: watch::Receiver, + ) -> Result { // TODO: proper leader election if self.id == 0 { self.is_leader = true; @@ -1088,19 +1133,34 @@ impl GenericReplica for MultiPaxosReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { - pf_error!(self.id; "error handling ctrl msg: {}", e); + match self.handle_ctrl_msg(ctrl_msg).await { + Ok(terminate) => { + if let Some(restart) = terminate { + pf_warn!( + self.id; + "server got {} req", + if restart { "restart" } else { "shutdown" }); + return Ok(restart); + } + }, + Err(e) => { + pf_error!(self.id; "error handling ctrl msg: {}", e); + } } }, // receiving termination signal - _ = rx_term.recv() => { + _ = rx_term.changed() => { pf_warn!(self.id; "server caught termination signal"); return Ok(false); } } } } + + fn id(&self) -> ReplicaId { + self.id + } } /// Configuration parameters struct. @@ -1122,9 +1182,6 @@ pub struct MultiPaxosClient { /// Client ID. id: ClientId, - /// Address of the cluster manager oracle. - manager: SocketAddr, - /// Configuration parameters struct. _config: ClientConfigMultiPaxos, @@ -1135,7 +1192,7 @@ pub struct MultiPaxosClient { server_id: ReplicaId, /// Control API stub to the cluster manager. - ctrl_stub: Option, + ctrl_stub: ClientCtrlStub, /// API stubs for communicating with servers. 
api_stub: Option, @@ -1143,47 +1200,43 @@ pub struct MultiPaxosClient { #[async_trait] impl GenericEndpoint for MultiPaxosClient { - fn new( + async fn new_and_setup( manager: SocketAddr, config_str: Option<&str>, ) -> Result { + // connect to the cluster manager and get assigned a client ID + let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; + let id = ctrl_stub.id; + + // parse protocol-specific configs let config = parsed_config!(config_str => ClientConfigMultiPaxos; init_server_id)?; let init_server_id = config.init_server_id; Ok(MultiPaxosClient { - id: 255, // nil at this time - manager, + id, _config: config, servers: HashMap::new(), server_id: init_server_id, - ctrl_stub: None, + ctrl_stub, api_stub: None, }) } - async fn connect(&mut self) -> Result { + async fn connect(&mut self) -> Result<(), SummersetError> { // disallow reconnection without leaving if self.api_stub.is_some() { return logged_err!(self.id; "reconnecting without leaving"); } - // if ctrl_stubs not established yet, connect to the manager - if self.ctrl_stub.is_none() { - let ctrl_stub = - ClientCtrlStub::new_by_connect(self.manager).await?; - self.id = ctrl_stub.id; - self.ctrl_stub = Some(ctrl_stub); - } - let ctrl_stub = self.ctrl_stub.as_mut().unwrap(); - // ask the manager about the list of active servers - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; while !sent { - sent = ctrl_stub.send_req(None)?; + sent = self.ctrl_stub.send_req(None)?; } - let reply = ctrl_stub.recv_reply().await?; + let reply = self.ctrl_stub.recv_reply().await?; match reply { CtrlReply::QueryInfo { servers } => { // connect to the one with server ID in config @@ -1194,7 +1247,7 @@ impl GenericEndpoint for MultiPaxosClient { .await?; self.api_stub = Some(api_stub); self.servers = servers; - Ok(self.id) + Ok(()) } _ => logged_err!(self.id; "unexpected reply type received"), } @@ -1222,26 +1275,19 @@ impl GenericEndpoint for MultiPaxosClient { // if permanently leaving, send leave notification to the manager if permanent { - // disallow multiple permanent leaving - if self.ctrl_stub.is_none() { - return logged_err!(self.id; "repeated permanent leaving"); + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; + while !sent { + sent = self.ctrl_stub.send_req(None)?; } - if let Some(mut ctrl_stub) = self.ctrl_stub.take() { - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; - while !sent { - sent = ctrl_stub.send_req(None)?; + let reply = self.ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::Leave => { + pf_info!(self.id; "left current manager connection"); } - - let reply = ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - ctrl_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } + _ => { + return logged_err!(self.id; "unexpected reply type received"); } } } @@ -1287,4 +1333,12 @@ impl GenericEndpoint for MultiPaxosClient { None => logged_err!(self.id; "client is not set up"), } } + + fn id(&self) -> ClientId { + self.id + } + + fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + &mut self.ctrl_stub + } } diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index bbfb79c6..ffbc57e1 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -21,7 +21,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use 
tokio::time::Duration; -use tokio::sync::mpsc; +use tokio::sync::watch; /// Configuration parameters struct. #[derive(Debug, Deserialize)] @@ -227,11 +227,58 @@ impl RepNothingReplica { Ok(()) } - /// Synthesized handler of manager control messages. - fn handle_ctrl_msg(&mut self, _msg: CtrlMsg) -> Result<(), SummersetError> { - // TODO: fill this when more control message types added + /// Handler of ResetState control message. + async fn handle_ctrl_reset_state( + &mut self, + durable: bool, + ) -> Result<(), SummersetError> { + // send leave notification to manager and wait for its reply + self.control_hub.send_ctrl(CtrlMsg::Leave)?; + while self.control_hub.recv_ctrl().await? != CtrlMsg::LeaveReply {} + + // if `durable` is false, truncate backer file + if !durable { + // use 0 as a special log action ID here + self.storage_hub + .submit_action(0, LogAction::Truncate { offset: 0 })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id == 0 { + if log_result + != (LogResult::Truncate { + offset_ok: true, + now_size: 0, + }) + { + return logged_err!(self.id; "failed to truncate log to 0"); + } else { + return Ok(()); + } + } + } + } + Ok(()) } + + /// Synthesized handler of manager control messages. If ok, returns + /// `Some(true)` if decides to terminate and reboot, `Some(false)` if + /// decides to shutdown completely, and `None` if not terminating. + async fn handle_ctrl_msg( + &mut self, + msg: CtrlMsg, + ) -> Result, SummersetError> { + // TODO: fill this when more control message types added + match msg { + CtrlMsg::ResetState { durable } => { + self.handle_ctrl_reset_state(durable).await?; + Ok(Some(true)) + } + + _ => Ok(None), // ignore all other types + } + } } #[async_trait] @@ -300,15 +347,10 @@ impl GenericReplica for RepNothingReplica { }) } - async fn run(&mut self) -> Result { - // set up termination signals handler - let (tx_term, mut rx_term) = mpsc::unbounded_channel(); - ctrlc::set_handler(move || { - if let Err(e) = tx_term.send(true) { - pf_error!("s"; "error sending to term channel: {}", e); - } - })?; - + async fn run( + &mut self, + mut rx_term: watch::Receiver, + ) -> Result { loop { tokio::select! { // client request batch @@ -354,19 +396,34 @@ impl GenericReplica for RepNothingReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { - pf_error!(self.id; "error handling ctrl msg: {}", e); + match self.handle_ctrl_msg(ctrl_msg).await { + Ok(terminate) => { + if let Some(restart) = terminate { + pf_warn!( + self.id; + "server got {} req", + if restart { "restart" } else { "shutdown" }); + return Ok(restart); + } + }, + Err(e) => { + pf_error!(self.id; "error handling ctrl msg: {}", e); + } } }, // receiving termination signal - _ = rx_term.recv() => { + _ = rx_term.changed() => { pf_warn!(self.id; "server caught termination signal"); return Ok(false); } } } } + + fn id(&self) -> ReplicaId { + self.id + } } /// Configuration parameters struct. @@ -388,14 +445,11 @@ pub struct RepNothingClient { /// Client ID. id: ClientId, - /// Address of the cluster manager oracle. - manager: SocketAddr, - /// Configuration parameters struct. config: ClientConfigRepNothing, /// Control API stub to the cluster manager. - ctrl_stub: Option, + ctrl_stub: ClientCtrlStub, /// API stubs for communicating with servers. 
api_stub: Option, @@ -403,44 +457,40 @@ pub struct RepNothingClient { #[async_trait] impl GenericEndpoint for RepNothingClient { - fn new( + async fn new_and_setup( manager: SocketAddr, config_str: Option<&str>, ) -> Result { + // connect to the cluster manager and get assigned a client ID + let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; + let id = ctrl_stub.id; + + // parse protocol-specific configs let config = parsed_config!(config_str => ClientConfigRepNothing; server_id)?; Ok(RepNothingClient { - id: 255, // nil at this time - manager, + id, config, - ctrl_stub: None, + ctrl_stub, api_stub: None, }) } - async fn connect(&mut self) -> Result { + async fn connect(&mut self) -> Result<(), SummersetError> { // disallow reconnection without leaving if self.api_stub.is_some() { return logged_err!(self.id; "reconnecting without leaving"); } - // if ctrl_stubs not established yet, connect to the manager - if self.ctrl_stub.is_none() { - let ctrl_stub = - ClientCtrlStub::new_by_connect(self.manager).await?; - self.id = ctrl_stub.id; - self.ctrl_stub = Some(ctrl_stub); - } - let ctrl_stub = self.ctrl_stub.as_mut().unwrap(); - // ask the manager about the list of active servers - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; while !sent { - sent = ctrl_stub.send_req(None)?; + sent = self.ctrl_stub.send_req(None)?; } - let reply = ctrl_stub.recv_reply().await?; + let reply = self.ctrl_stub.recv_reply().await?; match reply { CtrlReply::QueryInfo { servers } => { // connect to the one with server ID in config @@ -450,7 +500,7 @@ impl GenericEndpoint for RepNothingClient { ) .await?; self.api_stub = Some(api_stub); - Ok(self.id) + Ok(()) } _ => logged_err!(self.id; "unexpected reply type received"), } @@ -478,26 +528,19 @@ impl GenericEndpoint for RepNothingClient { // if permanently leaving, send leave notification to the manager if permanent { - // disallow multiple permanent leaving - if self.ctrl_stub.is_none() { - return logged_err!(self.id; "repeated permanent leaving"); + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; + while !sent { + sent = self.ctrl_stub.send_req(None)?; } - if let Some(mut ctrl_stub) = self.ctrl_stub.take() { - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; - while !sent { - sent = ctrl_stub.send_req(None)?; + let reply = self.ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::Leave => { + pf_info!(self.id; "left current manager connection"); } - - let reply = ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - ctrl_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } + _ => { + return logged_err!(self.id; "unexpected reply type received"); } } } @@ -521,4 +564,12 @@ impl GenericEndpoint for RepNothingClient { None => logged_err!(self.id; "client is not set up"), } } + + fn id(&self) -> ClientId { + self.id + } + + fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + &mut self.ctrl_stub + } } diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 1c72b41a..ada30d45 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -22,7 +22,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; -use tokio::sync::mpsc; +use tokio::sync::watch; use reed_solomon_erasure::galois_8::ReedSolomon; @@ -1001,11 +1001,61 @@ impl 
RSPaxosReplica { Ok(()) } - /// Synthesized handler of manager control messages. - fn handle_ctrl_msg(&mut self, _msg: CtrlMsg) -> Result<(), SummersetError> { - // TODO: fill this when more control message types added + /// Handler of ResetState control message. + async fn handle_ctrl_reset_state( + &mut self, + durable: bool, + ) -> Result<(), SummersetError> { + // send leave notification to peers and wait for their replies + self.transport_hub.leave().await?; + + // send leave notification to manager and wait for its reply + self.control_hub.send_ctrl(CtrlMsg::Leave)?; + while self.control_hub.recv_ctrl().await? != CtrlMsg::LeaveReply {} + + // if `durable` is false, truncate backer file + if !durable { + // use 0 as a special log action ID here + self.storage_hub + .submit_action(0, LogAction::Truncate { offset: 0 })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id == 0 { + if log_result + != (LogResult::Truncate { + offset_ok: true, + now_size: 0, + }) + { + return logged_err!(self.id; "failed to truncate log to 0"); + } else { + return Ok(()); + } + } + } + } + Ok(()) } + + /// Synthesized handler of manager control messages. If ok, returns + /// `Some(true)` if decides to terminate and reboot, `Some(false)` if + /// decides to shutdown completely, and `None` if not terminating. + async fn handle_ctrl_msg( + &mut self, + msg: CtrlMsg, + ) -> Result, SummersetError> { + // TODO: fill this when more control message types added + match msg { + CtrlMsg::ResetState { durable } => { + self.handle_ctrl_reset_state(durable).await?; + Ok(Some(true)) + } + + _ => Ok(None), // ignore all other types + } + } } #[async_trait] @@ -1114,15 +1164,10 @@ impl GenericReplica for RSPaxosReplica { }) } - async fn run(&mut self) -> Result { - // set up termination signals handler - let (tx_term, mut rx_term) = mpsc::unbounded_channel(); - ctrlc::set_handler(move || { - if let Err(e) = tx_term.send(true) { - pf_error!("s"; "error sending to term channel: {}", e); - } - })?; - + async fn run( + &mut self, + mut rx_term: watch::Receiver, + ) -> Result { // TODO: proper leader election if self.id == 0 { self.is_leader = true; @@ -1186,19 +1231,34 @@ impl GenericReplica for RSPaxosReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { - pf_error!(self.id; "error handling ctrl msg: {}", e); + match self.handle_ctrl_msg(ctrl_msg).await { + Ok(terminate) => { + if let Some(restart) = terminate { + pf_warn!( + self.id; + "server got {} req", + if restart { "restart" } else { "shutdown" }); + return Ok(restart); + } + }, + Err(e) => { + pf_error!(self.id; "error handling ctrl msg: {}", e); + } } }, // receiving termination signal - _ = rx_term.recv() => { + _ = rx_term.changed() => { pf_warn!(self.id; "server caught termination signal"); return Ok(false); } } } } + + fn id(&self) -> ReplicaId { + self.id + } } /// Configuration parameters struct. @@ -1220,9 +1280,6 @@ pub struct RSPaxosClient { /// Client ID. id: ClientId, - /// Address of the cluster manager oracle. - manager: SocketAddr, - /// Configuration parameters struct. _config: ClientConfigRSPaxos, @@ -1233,7 +1290,7 @@ pub struct RSPaxosClient { server_id: ReplicaId, /// Control API stub to the cluster manager. - ctrl_stub: Option, + ctrl_stub: ClientCtrlStub, /// API stubs for communicating with servers. 
api_stub: Option, @@ -1241,47 +1298,43 @@ pub struct RSPaxosClient { #[async_trait] impl GenericEndpoint for RSPaxosClient { - fn new( + async fn new_and_setup( manager: SocketAddr, config_str: Option<&str>, ) -> Result { + // connect to the cluster manager and get assigned a client ID + let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; + let id = ctrl_stub.id; + + // parse protocol-specific configs let config = parsed_config!(config_str => ClientConfigRSPaxos; init_server_id)?; let init_server_id = config.init_server_id; Ok(RSPaxosClient { - id: 255, // nil at this time - manager, + id, _config: config, servers: HashMap::new(), server_id: init_server_id, - ctrl_stub: None, + ctrl_stub, api_stub: None, }) } - async fn connect(&mut self) -> Result { + async fn connect(&mut self) -> Result<(), SummersetError> { // disallow reconnection without leaving if self.api_stub.is_some() { return logged_err!(self.id; "reconnecting without leaving"); } - // if ctrl_stubs not established yet, connect to the manager - if self.ctrl_stub.is_none() { - let ctrl_stub = - ClientCtrlStub::new_by_connect(self.manager).await?; - self.id = ctrl_stub.id; - self.ctrl_stub = Some(ctrl_stub); - } - let ctrl_stub = self.ctrl_stub.as_mut().unwrap(); - // ask the manager about the list of active servers - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; while !sent { - sent = ctrl_stub.send_req(None)?; + sent = self.ctrl_stub.send_req(None)?; } - let reply = ctrl_stub.recv_reply().await?; + let reply = self.ctrl_stub.recv_reply().await?; match reply { CtrlReply::QueryInfo { servers } => { // connect to the one with server ID in config @@ -1292,7 +1345,7 @@ impl GenericEndpoint for RSPaxosClient { .await?; self.api_stub = Some(api_stub); self.servers = servers; - Ok(self.id) + Ok(()) } _ => logged_err!(self.id; "unexpected reply type received"), } @@ -1320,26 +1373,19 @@ impl GenericEndpoint for RSPaxosClient { // if permanently leaving, send leave notification to the manager if permanent { - // disallow multiple permanent leaving - if self.ctrl_stub.is_none() { - return logged_err!(self.id; "repeated permanent leaving"); + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; + while !sent { + sent = self.ctrl_stub.send_req(None)?; } - if let Some(mut ctrl_stub) = self.ctrl_stub.take() { - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; - while !sent { - sent = ctrl_stub.send_req(None)?; + let reply = self.ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::Leave => { + pf_info!(self.id; "left current manager connection"); } - - let reply = ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - ctrl_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } + _ => { + return logged_err!(self.id; "unexpected reply type received"); } } } @@ -1385,4 +1431,12 @@ impl GenericEndpoint for RSPaxosClient { None => logged_err!(self.id; "client is not set up"), } } + + fn id(&self) -> ClientId { + self.id + } + + fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + &mut self.ctrl_stub + } } diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index b0156ad3..7d9aa763 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -22,7 +22,7 @@ use async_trait::async_trait; use serde::{Serialize, Deserialize}; use tokio::time::Duration; -use 
tokio::sync::mpsc; +use tokio::sync::watch; /// Configuration parameters struct. #[derive(Debug, Deserialize)] @@ -383,11 +383,61 @@ impl SimplePushReplica { Ok(()) } - /// Synthesized handler of manager control messages. - fn handle_ctrl_msg(&mut self, _msg: CtrlMsg) -> Result<(), SummersetError> { - // TODO: fill this when more control message types added + /// Handler of ResetState control message. + async fn handle_ctrl_reset_state( + &mut self, + durable: bool, + ) -> Result<(), SummersetError> { + // send leave notification to peers and wait for their replies + self.transport_hub.leave().await?; + + // send leave notification to manager and wait for its reply + self.control_hub.send_ctrl(CtrlMsg::Leave)?; + while self.control_hub.recv_ctrl().await? != CtrlMsg::LeaveReply {} + + // if `durable` is false, truncate backer file + if !durable { + // use 0 as a special log action ID here + self.storage_hub + .submit_action(0, LogAction::Truncate { offset: 0 })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id == 0 { + if log_result + != (LogResult::Truncate { + offset_ok: true, + now_size: 0, + }) + { + return logged_err!(self.id; "failed to truncate log to 0"); + } else { + return Ok(()); + } + } + } + } + Ok(()) } + + /// Synthesized handler of manager control messages. If ok, returns + /// `Some(true)` if decides to terminate and reboot, `Some(false)` if + /// decides to shutdown completely, and `None` if not terminating. + async fn handle_ctrl_msg( + &mut self, + msg: CtrlMsg, + ) -> Result, SummersetError> { + // TODO: fill this when more control message types added + match msg { + CtrlMsg::ResetState { durable } => { + self.handle_ctrl_reset_state(durable).await?; + Ok(Some(true)) + } + + _ => Ok(None), // ignore all other types + } + } } #[async_trait] @@ -476,15 +526,10 @@ impl GenericReplica for SimplePushReplica { }) } - async fn run(&mut self) -> Result { - // set up termination signals handler - let (tx_term, mut rx_term) = mpsc::unbounded_channel(); - ctrlc::set_handler(move || { - if let Err(e) = tx_term.send(true) { - pf_error!("s"; "error sending to term channel: {}", e); - } - })?; - + async fn run( + &mut self, + mut rx_term: watch::Receiver, + ) -> Result { loop { tokio::select! { // client request batch @@ -552,19 +597,34 @@ impl GenericReplica for SimplePushReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - if let Err(e) = self.handle_ctrl_msg(ctrl_msg) { - pf_error!(self.id; "error handling ctrl msg: {}", e); + match self.handle_ctrl_msg(ctrl_msg).await { + Ok(terminate) => { + if let Some(restart) = terminate { + pf_warn!( + self.id; + "server got {} req", + if restart { "restart" } else { "shutdown" }); + return Ok(restart); + } + }, + Err(e) => { + pf_error!(self.id; "error handling ctrl msg: {}", e); + } } }, // receiving termination signal - _ = rx_term.recv() => { + _ = rx_term.changed() => { pf_warn!(self.id; "server caught termination signal"); return Ok(false); } } } } + + fn id(&self) -> ReplicaId { + self.id + } } /// Configuration parameters struct. @@ -586,14 +646,11 @@ pub struct SimplePushClient { /// Client ID. id: ClientId, - /// Address of the cluster manager oracle. - manager: SocketAddr, - /// Configuration parameters struct. config: ClientConfigSimplePush, /// Control API stub to the cluster manager. - ctrl_stub: Option, + ctrl_stub: ClientCtrlStub, /// API stubs for communicating with servers. 
api_stub: Option, @@ -601,44 +658,40 @@ pub struct SimplePushClient { #[async_trait] impl GenericEndpoint for SimplePushClient { - fn new( + async fn new_and_setup( manager: SocketAddr, config_str: Option<&str>, ) -> Result { + // connect to the cluster manager and get assigned a client ID + let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; + let id = ctrl_stub.id; + + // parse protocol-specific configs let config = parsed_config!(config_str => ClientConfigSimplePush; server_id)?; Ok(SimplePushClient { - id: 255, // nil at this time - manager, + id, config, - ctrl_stub: None, + ctrl_stub, api_stub: None, }) } - async fn connect(&mut self) -> Result { + async fn connect(&mut self) -> Result<(), SummersetError> { // disallow reconnection without leaving if self.api_stub.is_some() { return logged_err!(self.id; "reconnecting without leaving"); } - // if ctrl_stubs not established yet, connect to the manager - if self.ctrl_stub.is_none() { - let ctrl_stub = - ClientCtrlStub::new_by_connect(self.manager).await?; - self.id = ctrl_stub.id; - self.ctrl_stub = Some(ctrl_stub); - } - let ctrl_stub = self.ctrl_stub.as_mut().unwrap(); - // ask the manager about the list of active servers - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; while !sent { - sent = ctrl_stub.send_req(None)?; + sent = self.ctrl_stub.send_req(None)?; } - let reply = ctrl_stub.recv_reply().await?; + let reply = self.ctrl_stub.recv_reply().await?; match reply { CtrlReply::QueryInfo { servers } => { // connect to the one with server ID in config @@ -648,7 +701,7 @@ impl GenericEndpoint for SimplePushClient { ) .await?; self.api_stub = Some(api_stub); - Ok(self.id) + Ok(()) } _ => logged_err!(self.id; "unexpected reply type received"), } @@ -676,26 +729,19 @@ impl GenericEndpoint for SimplePushClient { // if permanently leaving, send leave notification to the manager if permanent { - // disallow multiple permanent leaving - if self.ctrl_stub.is_none() { - return logged_err!(self.id; "repeated permanent leaving"); + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; + while !sent { + sent = self.ctrl_stub.send_req(None)?; } - if let Some(mut ctrl_stub) = self.ctrl_stub.take() { - let mut sent = ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; - while !sent { - sent = ctrl_stub.send_req(None)?; + let reply = self.ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::Leave => { + pf_info!(self.id; "left current manager connection"); } - - let reply = ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - ctrl_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } + _ => { + return logged_err!(self.id; "unexpected reply type received"); } } } @@ -719,4 +765,12 @@ impl GenericEndpoint for SimplePushClient { None => logged_err!(self.id; "client is not set up"), } } + + fn id(&self) -> ClientId { + self.id + } + + fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + &mut self.ctrl_stub + } } diff --git a/src/server/external.rs b/src/server/external.rs index 3083a662..c52a946c 100644 --- a/src/server/external.rs +++ b/src/server/external.rs @@ -519,6 +519,7 @@ mod external_tests { ) .await?; barrier2.wait().await; + // recv requests from client let mut reqs: Vec<(ClientId, ApiRequest)> = vec![]; while reqs.len() < 3 { let mut req_batch = api.get_req_batch().await?; @@ -551,6 +552,7 @@ mod 
external_tests { cmd: Command::Get { key: "Jose".into() }, } ); + // send replies to client api.send_reply( ApiReply::Reply { id: 0, @@ -584,6 +586,7 @@ mod external_tests { let mut api_stub = ClientApiStub::new_by_connect(2857, "127.0.0.1:53700".parse()?) .await?; + // send requests to server api_stub.send_req(Some(&ApiRequest::Req { id: 0, cmd: Command::Put { @@ -599,6 +602,7 @@ mod external_tests { id: 1, cmd: Command::Get { key: "Jose".into() }, }))?; + // recv replies from server assert_eq!( api_stub.recv_reply().await?, ApiReply::Reply { @@ -642,6 +646,7 @@ mod external_tests { ) .await?; barrier2.wait().await; + // recv request from client let mut reqs: Vec<(ClientId, ApiRequest)> = vec![]; while reqs.is_empty() { let mut req_batch = api.get_req_batch().await?; @@ -660,6 +665,7 @@ mod external_tests { }, } ); + // send reply to client api.send_reply( ApiReply::Reply { id: 0, @@ -668,6 +674,7 @@ mod external_tests { }, client, )?; + // recv request from new client reqs.clear(); while reqs.is_empty() { let mut req_batch = api.get_req_batch().await?; @@ -687,6 +694,7 @@ mod external_tests { }, } ); + // send reply to new client api.send_reply( ApiReply::Reply { id: 0, @@ -704,6 +712,7 @@ mod external_tests { let mut api_stub = ClientApiStub::new_by_connect(2857, "127.0.0.1:54700".parse()?) .await?; + // send request to server api_stub.send_req(Some(&ApiRequest::Req { id: 0, cmd: Command::Put { @@ -711,6 +720,7 @@ mod external_tests { value: "123".into(), }, }))?; + // recv reply from server assert_eq!( api_stub.recv_reply().await?, ApiReply::Reply { @@ -719,13 +729,15 @@ mod external_tests { redirect: None, } ); + // leave and come back as new client api_stub.send_req(Some(&ApiRequest::Leave))?; assert_eq!(api_stub.recv_reply().await?, ApiReply::Leave); api_stub.forget(); - time::sleep(Duration::from_millis(1)).await; + time::sleep(Duration::from_millis(100)).await; let mut api_stub = ClientApiStub::new_by_connect(2858, "127.0.0.1:54700".parse()?) .await?; + // send request to server api_stub.send_req(Some(&ApiRequest::Req { id: 0, cmd: Command::Put { @@ -733,6 +745,7 @@ mod external_tests { value: "456".into(), }, }))?; + // recv reply from server assert_eq!( api_stub.recv_reply().await?, ApiReply::Reply { diff --git a/src/server/replica.rs b/src/server/replica.rs index cae9d042..6c305ad3 100644 --- a/src/server/replica.rs +++ b/src/server/replica.rs @@ -7,6 +7,8 @@ use crate::utils::SummersetError; use async_trait::async_trait; +use tokio::sync::watch; + /// Server replica ID type. pub type ReplicaId = u8; @@ -28,5 +30,11 @@ pub trait GenericReplica { /// terminated normally and wants to restart (e.g., receiving a reset /// control message) or `Ok(false)` if terminated normally and does not /// want to restart (e.g., receiving a termination signal). - async fn run(&mut self) -> Result; + async fn run( + &mut self, + rx_term: watch::Receiver, // termination signals channel + ) -> Result; + + /// Gets my replica ID. 
+ fn id(&self) -> ReplicaId; } diff --git a/src/server/transport.rs b/src/server/transport.rs index 8f5f69cf..e7ca2998 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -11,7 +11,7 @@ use crate::server::ReplicaId; use bytes::BytesMut; -use serde::{Serialize, de::DeserializeOwned}; +use serde::{Serialize, Deserialize, de::DeserializeOwned}; use tokio::net::{TcpListener, TcpStream}; use tokio::net::tcp::{OwnedReadHalf, OwnedWriteHalf}; @@ -20,6 +20,19 @@ use tokio::sync::mpsc; use tokio::task::JoinHandle; use tokio::time::{self, Duration}; +/// Peer-peer message wrapper type that includes leave notification variants. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +enum PeerMessage { + /// Normal protocol-specific request. + Msg { msg: Msg }, + + /// Server leave notification. + Leave, + + /// Reply to leave notification. + LeaveReply, +} + /// Server internal TCP transport module. pub struct TransportHub { /// My replica ID. @@ -29,11 +42,14 @@ pub struct TransportHub { population: u8, /// Receiver side of the recv channel. - rx_recv: mpsc::UnboundedReceiver<(ReplicaId, Msg)>, + rx_recv: mpsc::UnboundedReceiver<(ReplicaId, PeerMessage)>, /// Map from peer ID -> sender side of the send channel, shared with the /// peer acceptor thread. - tx_sends: flashmap::ReadHandle>, + tx_sends: flashmap::ReadHandle< + ReplicaId, + mpsc::UnboundedSender>, + >, /// Join handle of the peer acceptor thread. _peer_acceptor_handle: JoinHandle<()>, @@ -76,8 +92,10 @@ where let (tx_recv, rx_recv) = mpsc::unbounded_channel(); - let (tx_sends_write, tx_sends_read) = - flashmap::new::>(); + let (tx_sends_write, tx_sends_read) = flashmap::new::< + ReplicaId, + mpsc::UnboundedSender>, + >(); let (peer_messenger_handles_write, peer_messenger_handles_read) = flashmap::new::>(); @@ -141,7 +159,7 @@ where logged_err!(self.me; "invalid group size {}", group) } else { while self.current_peers()?.count() + 1 < group { - time::sleep(Duration::from_millis(10)).await; + time::sleep(Duration::from_millis(100)).await; } Ok(()) } @@ -170,7 +188,7 @@ where match tx_sends_guard.get(&peer) { Some(tx_send) => { tx_send - .send(msg) + .send(PeerMessage::Msg { msg }) .map_err(|e| SummersetError(e.to_string()))?; } None => { @@ -207,7 +225,7 @@ where tx_sends_guard .get(&peer) .unwrap() - .send(msg.clone()) + .send(PeerMessage::Msg { msg: msg.clone() }) .map_err(|e| SummersetError(e.to_string()))?; } @@ -220,10 +238,47 @@ where &mut self, ) -> Result<(ReplicaId, Msg), SummersetError> { match self.rx_recv.recv().await { - Some((id, msg)) => Ok((id, msg)), + Some((id, peer_msg)) => match peer_msg { + PeerMessage::Msg { msg } => Ok((id, msg)), + _ => logged_err!(self.me; "unexpected peer message type"), + }, None => logged_err!(self.me; "recv channel has been closed"), } } + + /// Broadcasts leave notifications to all peers and waits for replies. 
+ pub async fn leave(&mut self) -> Result<(), SummersetError> { + let tx_sends_guard = self.tx_sends.guard(); + let mut num_peers = 0; + for &peer in tx_sends_guard.keys() { + if peer == self.me { + continue; + } + + // not skipped + tx_sends_guard + .get(&peer) + .unwrap() + .send(PeerMessage::Leave) + .map_err(|e| SummersetError(e.to_string()))?; + num_peers += 1; + } + + let mut replies = Bitmap::new(self.population, false); + while replies.count() < num_peers { + match self.rx_recv.recv().await { + Some((id, peer_msg)) => match peer_msg { + PeerMessage::LeaveReply => replies.set(id, true)?, + _ => continue, // ignore all other types of messages + }, + None => { + return logged_err!(self.me; "recv channel has been closed"); + } + } + } + + Ok(()) + } } // TransportHub peer_acceptor thread implementation @@ -242,10 +297,10 @@ where me: ReplicaId, id: ReplicaId, addr: SocketAddr, - tx_recv: mpsc::UnboundedSender<(ReplicaId, Msg)>, + tx_recv: mpsc::UnboundedSender<(ReplicaId, PeerMessage)>, tx_sends: &mut flashmap::WriteHandle< ReplicaId, - mpsc::UnboundedSender, + mpsc::UnboundedSender>, >, peer_messenger_handles: &mut flashmap::WriteHandle< ReplicaId, @@ -280,10 +335,10 @@ where me: ReplicaId, mut stream: TcpStream, addr: SocketAddr, - tx_recv: mpsc::UnboundedSender<(ReplicaId, Msg)>, + tx_recv: mpsc::UnboundedSender<(ReplicaId, PeerMessage)>, tx_sends: &mut flashmap::WriteHandle< ReplicaId, - mpsc::UnboundedSender, + mpsc::UnboundedSender>, >, peer_messenger_handles: &mut flashmap::WriteHandle< ReplicaId, @@ -321,7 +376,7 @@ where id: ReplicaId, tx_sends: &mut flashmap::WriteHandle< ReplicaId, - mpsc::UnboundedSender, + mpsc::UnboundedSender>, >, peer_messenger_handles: &mut flashmap::WriteHandle< ReplicaId, @@ -343,11 +398,11 @@ where /// Peer acceptor thread function. 
async fn peer_acceptor_thread( me: ReplicaId, - tx_recv: mpsc::UnboundedSender<(ReplicaId, Msg)>, + tx_recv: mpsc::UnboundedSender<(ReplicaId, PeerMessage)>, peer_listener: TcpListener, mut tx_sends: flashmap::WriteHandle< ReplicaId, - mpsc::UnboundedSender, + mpsc::UnboundedSender>, >, mut peer_messenger_handles: flashmap::WriteHandle< ReplicaId, @@ -444,7 +499,7 @@ where write_buf: &mut BytesMut, write_buf_cursor: &mut usize, conn_write: &OwnedWriteHalf, - msg: Option<&Msg>, + msg: Option<&PeerMessage>, ) -> Result { safe_tcp_write(write_buf, write_buf_cursor, conn_write, msg) } @@ -455,7 +510,7 @@ where // message itself read_buf: &mut BytesMut, conn_read: &mut OwnedReadHalf, - ) -> Result { + ) -> Result, SummersetError> { safe_tcp_read(read_buf, conn_read).await } @@ -465,8 +520,8 @@ where id: ReplicaId, // corresonding peer's ID addr: SocketAddr, // corresponding peer's address conn: TcpStream, - mut rx_send: mpsc::UnboundedReceiver, - tx_recv: mpsc::UnboundedSender<(ReplicaId, Msg)>, + mut rx_send: mpsc::UnboundedReceiver>, + tx_recv: mpsc::UnboundedSender<(ReplicaId, PeerMessage)>, tx_exit: mpsc::UnboundedSender, ) { pf_debug!(me; "peer_messenger thread for {} ({}) spawned", id, addr); @@ -482,12 +537,32 @@ where // gets a message to send out msg = rx_send.recv(), if !retrying => { match msg { - Some(msg) => { + Some(PeerMessage::Leave) => { + // I decide to leave, notify peers + let peer_msg = PeerMessage::Leave; + if let Err(e) = Self::write_msg( + &mut write_buf, + &mut write_buf_cursor, + &conn_write, + Some(&peer_msg), + ) { + pf_error!(me; "error sending -> {}: {}", id, e); + } else { // skips `WouldBlock` failure check here + pf_debug!(me; "sent leave notification -> {}", id); + } + }, + + Some(PeerMessage::LeaveReply) => { + pf_error!(me; "proactively sending LeaveReply msg"); + }, + + Some(PeerMessage::Msg { msg }) => { + let peer_msg = PeerMessage::Msg { msg }; match Self::write_msg( &mut write_buf, &mut write_buf_cursor, &conn_write, - Some(&msg), + Some(&peer_msg), ) { Ok(true) => { // pf_trace!(me; "sent -> {} msg {:?}", id, msg); @@ -501,6 +576,7 @@ where } } }, + None => break, // channel gets closed and no messages remain } }, @@ -508,9 +584,35 @@ where // receives new message from peer msg = Self::read_msg(&mut read_buf, &mut conn_read) => { match msg { - Ok(msg) => { + Ok(PeerMessage::Leave) => { + // peer leaving, send dummy reply and break + let peer_msg = PeerMessage::LeaveReply; + if let Err(e) = Self::write_msg( + &mut write_buf, + &mut write_buf_cursor, + &conn_write, + Some(&peer_msg), + ) { + pf_error!(me; "error sending -> {}: {}", id, e); + } else { // skips `WouldBlock` failure check here + pf_debug!(me; "peer {} has left", id); + } + break; + }, + + Ok(PeerMessage::LeaveReply) => { + // my leave notification is acked by peer, break + let peer_msg = PeerMessage::LeaveReply; + if let Err(e) = tx_recv.send((id, peer_msg)) { + pf_error!(me; "error sending to tx_recv for {}: {}", id, e); + } + break; + } + + Ok(PeerMessage::Msg { msg }) => { // pf_trace!(me; "recv <- {} msg {:?}", id, msg); - if let Err(e) = tx_recv.send((id, msg)) { + let peer_msg = PeerMessage::Msg { msg }; + if let Err(e) = tx_recv.send((id, peer_msg)) { pf_error!(me; "error sending to tx_recv for {}: {}", id, e); } }, @@ -570,53 +672,53 @@ mod transport_tests { tokio::spawn(async move { // replica 1 let mut hub: TransportHub = - TransportHub::new_and_setup(1, 3, "127.0.0.1:54801".parse()?) + TransportHub::new_and_setup(1, 3, "127.0.0.1:53801".parse()?) 
.await?; barrier1.wait().await; - hub.connect_to_peer(2, "127.0.0.1:54802".parse()?).await?; + hub.connect_to_peer(2, "127.0.0.1:53802".parse()?).await?; // recv a message from 0 let (id, msg) = hub.recv_msg().await?; - assert!(id == 0); + assert_eq!(id, 0); assert_eq!(msg, TestMsg("hello".into())); // send a message to 0 hub.send_msg(TestMsg("world".into()), 0)?; // recv another message from 0 let (id, msg) = hub.recv_msg().await?; - assert!(id == 0); + assert_eq!(id, 0); assert_eq!(msg, TestMsg("nice".into())); // send another message to 0 hub.send_msg(TestMsg("job!".into()), 0)?; // wait for termination message let (id, msg) = hub.recv_msg().await?; - assert!(id == 0); + assert_eq!(id, 0); assert_eq!(msg, TestMsg("terminate".into())); Ok::<(), SummersetError>(()) }); tokio::spawn(async move { // replica 2 let mut hub: TransportHub = - TransportHub::new_and_setup(2, 3, "127.0.0.1:54802".parse()?) + TransportHub::new_and_setup(2, 3, "127.0.0.1:53802".parse()?) .await?; barrier2.wait().await; // recv a message from 0 let (id, msg) = hub.recv_msg().await?; - assert!(id == 0); + assert_eq!(id, 0); assert_eq!(msg, TestMsg("hello".into())); // send a message to 0 hub.send_msg(TestMsg("world".into()), 0)?; // wait for termination message let (id, msg) = hub.recv_msg().await?; - assert!(id == 0); + assert_eq!(id, 0); assert_eq!(msg, TestMsg("terminate".into())); Ok::<(), SummersetError>(()) }); // replica 0 let mut hub: TransportHub = - TransportHub::new_and_setup(0, 3, "127.0.0.1:54800".parse()?) + TransportHub::new_and_setup(0, 3, "127.0.0.1:53800".parse()?) .await?; barrier.wait().await; - hub.connect_to_peer(1, "127.0.0.1:54801".parse()?).await?; - hub.connect_to_peer(2, "127.0.0.1:54802".parse()?).await?; + hub.connect_to_peer(1, "127.0.0.1:53801".parse()?).await?; + hub.connect_to_peer(2, "127.0.0.1:53802".parse()?).await?; // send a message to 1 and 2 hub.bcast_msg(TestMsg("hello".into()), None)?; // recv a message from both 1 and 2 @@ -638,4 +740,49 @@ mod transport_tests { hub.bcast_msg(TestMsg("terminate".into()), None)?; Ok(()) } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn api_server_leave() -> Result<(), SummersetError> { + let barrier = Arc::new(Barrier::new(2)); + let barrier2 = barrier.clone(); + tokio::spawn(async move { + // replica 1/2 + let mut hub: TransportHub = + TransportHub::new_and_setup(1, 3, "127.0.0.1:54801".parse()?) + .await?; + barrier2.wait().await; + // recv a message from 0 + let (id, msg) = hub.recv_msg().await?; + assert_eq!(id, 0); + assert!(hub.current_peers()?.get(id)?); + assert_eq!(msg, TestMsg("goodbye".into())); + // leave and come back as 2 + hub.leave().await?; + time::sleep(Duration::from_millis(100)).await; + let mut hub: TransportHub = + TransportHub::new_and_setup(2, 3, "127.0.0.1:54802".parse()?) + .await?; + hub.connect_to_peer(0, "127.0.0.1:54800".parse()?).await?; + // send a message to 0 + hub.send_msg(TestMsg("hello".into()), 0)?; + Ok::<(), SummersetError>(()) + }); + // replica 0 + let mut hub: TransportHub = + TransportHub::new_and_setup(0, 3, "127.0.0.1:54800".parse()?) 
+ .await?; + barrier.wait().await; + hub.connect_to_peer(1, "127.0.0.1:54801".parse()?).await?; + assert!(hub.current_peers()?.get(1)?); + assert!(!hub.current_peers()?.get(2)?); + // send a message to 1 + hub.send_msg(TestMsg("goodbye".into()), 1)?; + // recv a message from 2 + let (id, msg) = hub.recv_msg().await?; + assert_eq!(id, 2); + assert_eq!(msg, TestMsg("hello".into())); + assert!(!hub.current_peers()?.get(1)?); + assert!(hub.current_peers()?.get(2)?); + Ok(()) + } } diff --git a/summerset_client/Cargo.toml b/summerset_client/Cargo.toml index 9633986c..bf270e74 100644 --- a/summerset_client/Cargo.toml +++ b/summerset_client/Cargo.toml @@ -7,7 +7,7 @@ authors = ["Guanzhou Jose Hu "] [dependencies] summerset = { path = "../" } -tokio = { version = "1.29", features = ["macros", "rt-multi-thread"] } +tokio = { version = "1.29", features = ["full"] } rand = "0.8" lazy_static = "1.4" clap = { version = "4.0", features = ["derive"] } diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index 98743091..9ad2c3d6 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -18,8 +18,8 @@ use serde::Deserialize; use tokio::time::Duration; use summerset::{ - GenericEndpoint, CommandResult, RequestId, SummersetError, pf_error, - logged_err, parsed_config, + GenericEndpoint, CommandResult, RequestId, CtrlRequest, CtrlReply, + SummersetError, pf_error, logged_err, parsed_config, }; lazy_static! { @@ -215,13 +215,35 @@ impl ClientTester { } } + /// Resets all servers in the cluster to initial empty state. + async fn reset_cluster(&mut self) -> Result<(), SummersetError> { + let ctrl_stub = self.driver.ctrl_stub(); + + // send ResetServer request to manager + let req = CtrlRequest::ResetServer { + server: None, + durable: false, + }; + let mut sent = ctrl_stub.send_req(Some(&req))?; + while !sent { + sent = ctrl_stub.send_req(None)?; + } + + // wait for reply from manager + let reply = ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::ResetServer { .. } => Ok(()), + _ => logged_err!("c"; "unexpected control reply type"), + } + } + /// Runs the individual correctness test. async fn do_test_by_name( &mut self, name: &str, ) -> Result<(), SummersetError> { // reset everything to initial state at the start of each test - // TODO: reset service state here + self.reset_cluster().await?; self.driver.connect().await?; self.cached_replies.clear(); diff --git a/summerset_client/src/drivers/closed_loop.rs b/summerset_client/src/drivers/closed_loop.rs index c3b1761c..cb361cf5 100644 --- a/summerset_client/src/drivers/closed_loop.rs +++ b/summerset_client/src/drivers/closed_loop.rs @@ -4,13 +4,14 @@ use tokio::time::{Duration, Instant}; use summerset::{ GenericEndpoint, ClientId, Command, CommandResult, ApiRequest, ApiReply, - RequestId, Timer, SummersetError, pf_debug, pf_error, logged_err, + RequestId, ClientCtrlStub, Timer, SummersetError, pf_debug, pf_error, + logged_err, }; /// Closed-loop driver struct. pub struct DriverClosedLoop { /// Client ID. - id: ClientId, + pub id: ClientId, /// Protocol-specific client endpoint. endpoint: Box, @@ -29,7 +30,7 @@ impl DriverClosedLoop { /// Creates a new closed-loop client. pub fn new(endpoint: Box, timeout: Duration) -> Self { DriverClosedLoop { - id: 255, // nil at this time + id: endpoint.id(), endpoint, next_req: 0, timer: Timer::new(), @@ -39,9 +40,7 @@ impl DriverClosedLoop { /// Establishes connection with the service. 
pub async fn connect(&mut self) -> Result<(), SummersetError> { - let id = self.endpoint.connect().await?; - self.id = id; - Ok(()) + self.endpoint.connect().await } /// Sends leave notification and forgets about the current TCP connections. @@ -185,4 +184,10 @@ impl DriverClosedLoop { _ => logged_err!(self.id; "unexpected reply type received"), } } + + /// Gets a mutable reference to the endpoint's control stub. + #[allow(dead_code)] + pub fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + self.endpoint.ctrl_stub() + } } diff --git a/summerset_client/src/drivers/open_loop.rs b/summerset_client/src/drivers/open_loop.rs index d07c4351..433f68a2 100644 --- a/summerset_client/src/drivers/open_loop.rs +++ b/summerset_client/src/drivers/open_loop.rs @@ -11,13 +11,14 @@ use tokio::time::{Duration, Instant}; use summerset::{ GenericEndpoint, ClientId, Command, CommandResult, ApiRequest, ApiReply, - RequestId, Timer, SummersetError, pf_debug, pf_error, logged_err, + RequestId, ClientCtrlStub, Timer, SummersetError, pf_debug, pf_error, + logged_err, }; /// Open-loop driver struct. pub struct DriverOpenLoop { /// Client ID. - id: ClientId, + pub id: ClientId, /// Protocol-specific client endpoint. endpoint: Box, @@ -43,7 +44,7 @@ impl DriverOpenLoop { /// Creates a new open-loop client. pub fn new(endpoint: Box, timeout: Duration) -> Self { DriverOpenLoop { - id: 255, // nil at this time + id: endpoint.id(), endpoint, next_req: 0, pending_reqs: HashMap::new(), @@ -55,9 +56,7 @@ impl DriverOpenLoop { /// Establishes connection with the service. pub async fn connect(&mut self) -> Result<(), SummersetError> { - let id = self.endpoint.connect().await?; - self.id = id; - Ok(()) + self.endpoint.connect().await } /// Waits for all pending replies to be received, then sends leave @@ -211,4 +210,9 @@ impl DriverOpenLoop { _ => logged_err!(self.id; "unexpected reply type received"), } } + + /// Gets a mutable reference to the endpoint's control stub. 
+ pub fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + self.endpoint.ctrl_stub() + } } diff --git a/summerset_client/src/main.rs b/summerset_client/src/main.rs index 26346720..81fbcdba 100644 --- a/summerset_client/src/main.rs +++ b/summerset_client/src/main.rs @@ -113,8 +113,9 @@ fn client_main() -> Result<(), SummersetError> { // enter tokio runtime, connect to the service, and do work runtime.block_on(async move { - let endpoint = - protocol.new_client_endpoint(args.manager, config_str)?; + let endpoint = protocol + .new_client_endpoint(args.manager, config_str) + .await?; match mode { ClientMode::Repl => { diff --git a/summerset_manager/Cargo.toml b/summerset_manager/Cargo.toml index f0464aa8..f2920305 100644 --- a/summerset_manager/Cargo.toml +++ b/summerset_manager/Cargo.toml @@ -7,8 +7,9 @@ authors = ["Guanzhou Jose Hu "] [dependencies] summerset = { path = "../" } -tokio = { version = "1.29", features = ["macros", "rt-multi-thread"] } +tokio = { version = "1.29", features = ["full"] } rand = "0.8" clap = { version = "4.0", features = ["derive"] } log = "0.4" env_logger = "0.10" +ctrlc = { version = "3.4", features = ["termination"] } diff --git a/summerset_manager/src/main.rs b/summerset_manager/src/main.rs index 6b886372..9a08319e 100644 --- a/summerset_manager/src/main.rs +++ b/summerset_manager/src/main.rs @@ -5,11 +5,14 @@ use std::process::ExitCode; use clap::Parser; +use log::{self, LevelFilter}; + use env_logger::Env; use tokio::runtime::Builder; +use tokio::sync::watch; -use summerset::{SmrProtocol, SummersetError, pf_warn, pf_error}; +use summerset::{SmrProtocol, SummersetError, pf_error}; /// Command line arguments definition. #[derive(Parser, Debug)] @@ -99,24 +102,43 @@ fn manager_main() -> Result<(), SummersetError> { )) })?; - // create tokio multi-threaded runtime - let runtime = Builder::new_multi_thread() - .enable_all() - .worker_threads(args.threads) - .thread_name("tokio-worker-manager") - .build()?; + // set up termination signals handler + let (tx_term, rx_term) = watch::channel(false); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("m"; "error sending to term channel: {}", e); + } + })?; + + let log_level = log::max_level(); - // enter tokio runtime, setup the cluster manager, and start the main - // event loop logic - runtime.block_on(async move { - let mut manager = protocol - .new_cluster_manager_setup(srv_addr, cli_addr, args.population) - .await?; + { + // create tokio multi-threaded runtime + let runtime = Builder::new_multi_thread() + .enable_all() + .worker_threads(args.threads) + .thread_name("tokio-worker-manager") + .build()?; - manager.run().await?; + // enter tokio runtime, setup the cluster manager, and start the main + // event loop logic + runtime.block_on(async move { + let mut manager = protocol + .new_cluster_manager_setup(srv_addr, cli_addr, args.population) + .await?; + + manager.run(rx_term).await?; + + // suppress logging before dropping the runtime to avoid spurious + // error messages + log::set_max_level(LevelFilter::Off); + + Ok::<(), SummersetError>(()) // give type hint for this async closure + })?; + } - Ok::<(), SummersetError>(()) // give type hint for this async closure - }) + log::set_max_level(log_level); + Ok(()) } fn main() -> ExitCode { @@ -130,7 +152,7 @@ fn main() -> ExitCode { pf_error!("m"; "manager_main exitted: {}", e); ExitCode::FAILURE } else { - pf_warn!("m"; "manager_main exitted successfully"); + // pf_warn!("m"; "manager_main exitted successfully"); ExitCode::SUCCESS } } 
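(Editor's note, not part of the patch.) The hunks above move termination handling out of each replica's `run()` and into the binaries: `main()` installs a single ctrl-c handler that publishes into a `tokio::sync::watch` channel, and every runtime's event loop observes it via `rx_term.changed()` inside `tokio::select!`. Below is a minimal, self-contained sketch of that pattern, assuming the `tokio` ("full" features) and `ctrlc` crates added in the Cargo.toml hunks; the `event_loop` and `tick` names are placeholders standing in for the replicas' real request/message/log-result branches, not Summerset code.

```rust
use std::error::Error;

use tokio::runtime::Builder;
use tokio::sync::watch;
use tokio::time::{self, Duration};

/// Stand-in event loop: runs until the termination flag flips, mirroring how
/// the patched `run(rx_term)` methods select on `rx_term.changed()`.
async fn event_loop(mut rx_term: watch::Receiver<bool>) {
    let mut tick = time::interval(Duration::from_millis(100));
    loop {
        tokio::select! {
            // placeholder for the real client/peer/storage event branches
            _ = tick.tick() => {}
            // termination signal observed through the watch channel
            _ = rx_term.changed() => return,
        }
    }
}

fn main() -> Result<(), Box<dyn Error>> {
    // the sender half lives in the ctrl-c handler, installed once per process
    let (tx_term, rx_term) = watch::channel(false);
    ctrlc::set_handler(move || {
        let _ = tx_term.send(true);
    })?;

    // the runtime is created after the handler, as in the patched binaries
    let runtime = Builder::new_multi_thread().enable_all().build()?;
    runtime.block_on(event_loop(rx_term));
    Ok(())
}
```

A `watch` receiver can be cloned cheaply (the server binary clones it as `rx_term_clone` for each restart iteration), so one process-wide ctrl-c handler can outlive any number of dropped and recreated runtimes, which the previous per-`run()` mpsc channel setup could not.
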
diff --git a/summerset_server/Cargo.toml b/summerset_server/Cargo.toml index 0a8ad28b..3058e797 100644 --- a/summerset_server/Cargo.toml +++ b/summerset_server/Cargo.toml @@ -7,8 +7,9 @@ authors = ["Guanzhou Jose Hu "] [dependencies] summerset = { path = "../" } -tokio = { version = "1.29", features = ["macros", "rt-multi-thread"] } +tokio = { version = "1.29", features = ["full"] } rand = "0.8" clap = { version = "4.0", features = ["derive"] } log = "0.4" env_logger = "0.10" +ctrlc = { version = "3.4", features = ["termination"] } diff --git a/summerset_server/src/main.rs b/summerset_server/src/main.rs index abbbc20d..800ae9e1 100644 --- a/summerset_server/src/main.rs +++ b/summerset_server/src/main.rs @@ -7,11 +7,14 @@ use std::sync::atomic::{AtomicBool, Ordering}; use clap::Parser; +use log::{self, LevelFilter}; + use env_logger::Env; use tokio::runtime::Builder; +use tokio::sync::watch; -use summerset::{SmrProtocol, SummersetError, pf_warn, pf_error}; +use summerset::{SmrProtocol, SummersetError, pf_error}; /// Command line arguments definition. #[derive(Parser, Debug)] @@ -109,9 +112,21 @@ fn server_main() -> Result<(), SummersetError> { Some(&args.config[..]) }; + // set up termination signals handler + let (tx_term, rx_term) = watch::channel(false); + ctrlc::set_handler(move || { + if let Err(e) = tx_term.send(true) { + pf_error!("s"; "error sending to term channel: {}", e); + } + })?; + + let log_level = log::max_level(); let shutdown = Arc::new(AtomicBool::new(false)); + while !shutdown.load(Ordering::SeqCst) { - let sd = shutdown.clone(); + log::set_max_level(log_level); + let shutdown_clone = shutdown.clone(); + let rx_term_clone = rx_term.clone(); // create tokio multi-threaded runtime let runtime = Builder::new_multi_thread() @@ -132,20 +147,25 @@ fn server_main() -> Result<(), SummersetError> { ) .await?; - if replica.run().await? { + if replica.run(rx_term_clone).await? { // event loop terminated but wants to restart (e.g., when // receiving a reset control message); just drop this runtime // and move to the next iteration of loop } else { // event loop terminated and does not want to restart (e.g., // when receiving a termination signal) - sd.store(true, Ordering::SeqCst); + shutdown_clone.store(true, Ordering::SeqCst); } + // suppress logging before dropping the runtime to avoid spurious + // error messages + log::set_max_level(LevelFilter::Off); + Ok::<(), SummersetError>(()) // give type hint for this async closure })?; } + log::set_max_level(log_level); Ok(()) } @@ -160,7 +180,7 @@ fn main() -> ExitCode { pf_error!("s"; "server_main exitted: {}", e); ExitCode::FAILURE } else { - pf_warn!("s"; "server_main exitted successfully"); + // pf_warn!("s"; "server_main exitted successfully"); ExitCode::SUCCESS } } From c2fbbb0787ae5793c4cd3bc4b518825821e23930 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 14:45:47 +0800 Subject: [PATCH 21/89] minor updates to README --- README.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 22eb30f3..974767d1 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,10 @@ This is a private mirror of [Summerset](https://github.com/josehu07/summerset). Below are a memo of development commands... 
+[![Format check](https://github.com/josehu07/summerset-private/actions/workflows/format.yml/badge.svg)](https://github.com/josehu07/summerset-private/actions?query=josehu07%3Aformat) +[![Build status](https://github.com/josehu07/summerset-private/actions/workflows/build.yml/badge.svg)](https://github.com/josehu07/summerset-private/actions?query=josehu07%3Abuild) +[![Tests status](https://github.com/josehu07/summerset-private/actions/workflows/tests.yml/badge.svg)](https://github.com/josehu07/summerset-private/actions?query=josehu07%3Atests) +[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) + To create a branch to track public repo `main`, pull new things from it, and merge into the private `main`: ```bash @@ -31,9 +36,9 @@ git push origin # Summerset -[![Format check](https://github.com/josehu07/summerset-private/actions/workflows/format.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Aformat) -[![Build status](https://github.com/josehu07/summerset-private/actions/workflows/build.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Abuild) -[![Tests status](https://github.com/josehu07/summerset-private/actions/workflows/tests.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Atests) +[![Format check](https://github.com/josehu07/summerset/actions/workflows/format.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Aformat) +[![Build status](https://github.com/josehu07/summerset/actions/workflows/build.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Abuild) +[![Tests status](https://github.com/josehu07/summerset/actions/workflows/tests.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Atests) [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) Summerset is a distributed key-value store supporting a wide range of state machine replication (SMR) protocols for research purposes. More protocols are actively being added. 
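Note on the termination-handling pattern introduced in the server/manager `main.rs` changes above: a `ctrlc` handler publishes on a `tokio::sync::watch` channel, and the event loop exits once that channel flips. Below is a minimal, self-contained sketch of the same pattern, assuming only the `ctrlc` and `tokio` dependencies added in the Cargo.toml hunks; the sleep branch is an illustrative stand-in for real event-loop work and is not Summerset code.

```rust
use tokio::runtime::Builder;
use tokio::sync::watch;
use tokio::time::{self, Duration};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // flip a watch channel when a termination signal (e.g. Ctrl-C) arrives
    let (tx_term, mut rx_term) = watch::channel(false);
    ctrlc::set_handler(move || {
        // best-effort send; the receiver may already be gone at exit
        let _ = tx_term.send(true);
    })?;

    // multi-threaded runtime, as in server_main / manager_main
    let runtime = Builder::new_multi_thread().enable_all().build()?;
    runtime.block_on(async move {
        loop {
            tokio::select! {
                // stand-in for the real event-loop work
                _ = time::sleep(Duration::from_millis(100)) => {}
                // termination signal received: break out so the runtime
                // can be dropped cleanly
                _ = rx_term.changed() => break,
            }
        }
    });
    Ok(())
}
```

Handing the receiver (or a clone of it) down into `run()`, as the patches do, lets each inner event loop observe the same signal; muting the log level just before the runtime is dropped avoids the spurious error messages from tasks torn down mid-await.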
From c2a36f76b8015da9c285fea05670e2abcb40d7f1 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 19:45:08 +0800 Subject: [PATCH 22/89] add performance delay simulation support --- Cargo.lock | 119 +++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + scripts/local_bench.tmp.py | 12 ++++ src/protocols/crossword.rs | 48 +++++++++++--- src/protocols/multipaxos.rs | 46 +++++++++++--- src/protocols/rep_nothing.rs | 26 ++++++-- src/protocols/rs_paxos.rs | 46 +++++++++++--- src/protocols/simple_push.rs | 46 +++++++++++--- src/server/external.rs | 6 +- src/server/statemach.rs | 6 +- src/server/storage.rs | 48 +++++++++++--- src/server/transport.rs | 82 ++++++++++++++++++------ src/utils/rscoding.rs | 20 ++++++ 13 files changed, 439 insertions(+), 67 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 883efd71..79437b05 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -97,6 +97,34 @@ dependencies = [ "syn 2.0.28", ] +[[package]] +name = "attribute-derive" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c124f12ade4e670107b132722d0ad1a5c9790bcbc1b265336369ea05626b4498" +dependencies = [ + "attribute-derive-macro", + "proc-macro2", + "quote", + "syn 2.0.28", +] + +[[package]] +name = "attribute-derive-macro" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b217a07446e0fb086f83401a98297e2d81492122f5874db5391bd270a185f88" +dependencies = [ + "collection_literals", + "interpolator", + "proc-macro-error", + "proc-macro-utils", + "proc-macro2", + "quote", + "quote-use", + "syn 2.0.28", +] + [[package]] name = "autocfg" version = "1.1.0" @@ -201,6 +229,12 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" +[[package]] +name = "collection_literals" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186dce98367766de751c42c4f03970fc60fc012296e706ccbb9d5df9b6c1e271" + [[package]] name = "color-print" version = "0.3.4" @@ -240,6 +274,17 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "derive-where" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "875a0460143f2dbcc71fd8a63f34b7c83ac66f14bead94054e7cd619c57bbb27" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.28", +] + [[package]] name = "dirs" version = "4.0.0" @@ -426,6 +471,26 @@ dependencies = [ "windows", ] +[[package]] +name = "get-size" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b61e2dab7eedce93a83ab3468b919873ff16bac5a3e704011ff836d22b2120" +dependencies = [ + "get-size-derive", +] + +[[package]] +name = "get-size-derive" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13a1bcfb855c1f340d5913ab542e36f25a1c56f57de79022928297632435dec2" +dependencies = [ + "attribute-derive", + "quote", + "syn 2.0.28", +] + [[package]] name = "getrandom" version = "0.2.10" @@ -495,6 +560,12 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "interpolator" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71dd52191aae121e8611f1e8dc3e324dd0dd1dee1e6dd91d10ee07a3cfb4d9d8" + [[package]] name = "is-terminal" version = "0.4.9" @@ -808,6 +879,41 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn 1.0.109", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro-utils" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f59e109e2f795a5070e69578c4dc101068139f74616778025ae1011d4cd41a8" +dependencies = [ + "proc-macro2", + "quote", + "smallvec", +] + [[package]] name = "proc-macro2" version = "1.0.66" @@ -826,6 +932,18 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "quote-use" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58e9a38ef862d7fec635661503289062bc5b3035e61859a8de3d3f81823accd2" +dependencies = [ + "derive-where", + "proc-macro2", + "quote", + "syn 2.0.28", +] + [[package]] name = "rand" version = "0.8.5" @@ -1120,6 +1238,7 @@ dependencies = [ "fixedbitset", "flashmap", "futures", + "get-size", "lazy_static", "log", "rand", diff --git a/Cargo.toml b/Cargo.toml index 707f1150..663e5da3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,3 +23,4 @@ toml = { version = "0.7", features = ["parse"] } log = "0.4" reed-solomon-erasure = { version = "6.0", features = ["simd-accel"] } ctrlc = { version = "3.4", features = ["termination"] } +get-size = { version = "0.1", features = ["derive"] } diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index a76b77ea..7107732b 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -3,6 +3,12 @@ import statistics +PERF_STORAGE_ALPHA = 0 +PERF_STORAGE_BETA = 0 +PERF_NETWORK_ALPHA = 10000 +PERF_NETWORK_BETA = 100 + + def do_cargo_build(): print("Building everything...") cmd = ["cargo", "build", "--workspace", "-r"] @@ -125,6 +131,12 @@ def bench_round( configs.append(f"fault_tolerance={fault_tolerance}") if shards_per_replica is not None: configs.append(f"shards_per_replica={shards_per_replica}") + + configs.append(f"perf_storage_a={PERF_STORAGE_ALPHA}") + configs.append(f"perf_storage_b={PERF_STORAGE_BETA}") + configs.append(f"perf_network_a={PERF_NETWORK_ALPHA}") + configs.append(f"perf_network_b={PERF_NETWORK_BETA}") + proc_cluster = launch_cluster(protocol, num_replicas, "+".join(configs)) wait_cluster_setup(proc_cluster, num_replicas) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 493213d4..5aff1728 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -19,6 +19,8 @@ use crate::protocols::SmrProtocol; use async_trait::async_trait; +use get_size::GetSize; + use serde::{Serialize, Deserialize}; use tokio::time::Duration; @@ -47,6 +49,12 @@ pub struct ReplicaConfigCrossword { /// Number of shards to assign to each replica. // TODO: proper config options. 
pub shards_per_replica: u8, + + // Performance simulation params (all zeros means no perf simulation): + pub perf_storage_a: u64, + pub perf_storage_b: u64, + pub perf_network_a: u64, + pub perf_network_b: u64, } #[allow(clippy::derivable_impls)] @@ -59,6 +67,10 @@ impl Default for ReplicaConfigCrossword { logger_sync: false, fault_tolerance: 0, shards_per_replica: 1, + perf_storage_a: 0, + perf_storage_b: 0, + perf_network_a: 0, + perf_network_b: 0, } } } @@ -122,7 +134,7 @@ struct Instance { } /// Stable storage log entry type. -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum LogEntry { /// Records an update to the largest prepare ballot seen. PrepareBal { slot: usize, ballot: Ballot }, @@ -139,7 +151,7 @@ enum LogEntry { } /// Peer-peer message type. -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, GetSize)] enum PeerMsg { /// Prepare message from leader to replicas. Prepare { slot: usize, ballot: Ballot }, @@ -1174,8 +1186,10 @@ impl GenericReplica for CrosswordReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigCrossword; batch_interval_us, max_batch_size, - backer_path, logger_sync, fault_tolerance, - shards_per_replica)?; + backer_path, logger_sync, + fault_tolerance, shards_per_replica, + perf_storage_a, perf_storage_b, + perf_network_a, perf_network_b)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -1188,13 +1202,29 @@ impl GenericReplica for CrosswordReplica { let state_machine = StateMachine::new_and_setup(id).await?; // setup storage hub module - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; + let storage_hub = StorageHub::new_and_setup( + id, + Path::new(&config.backer_path), + if config.perf_storage_a == 0 && config.perf_storage_b == 0 { + None + } else { + Some((config.perf_storage_a, config.perf_storage_b)) + }, + ) + .await?; // setup transport hub module - let mut transport_hub = - TransportHub::new_and_setup(id, population, p2p_addr).await?; + let mut transport_hub = TransportHub::new_and_setup( + id, + population, + p2p_addr, + if config.perf_network_a == 0 && config.perf_network_b == 0 { + None + } else { + Some((config.perf_network_a, config.perf_network_b)) + }, + ) + .await?; // ask for the list of peers to proactively connect to. Do this after // transport hub has been set up, so that I will be able to accept diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 2431ff86..dfcad00c 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -23,6 +23,8 @@ use crate::protocols::SmrProtocol; use async_trait::async_trait; +use get_size::GetSize; + use serde::{Serialize, Deserialize}; use tokio::time::Duration; @@ -42,6 +44,12 @@ pub struct ReplicaConfigMultiPaxos { /// Whether to call `fsync()`/`fdatasync()` on logger. pub logger_sync: bool, + + // Performance simulation params (all zeros means no perf simulation): + pub perf_storage_a: u64, + pub perf_storage_b: u64, + pub perf_network_a: u64, + pub perf_network_b: u64, } #[allow(clippy::derivable_impls)] @@ -52,6 +60,10 @@ impl Default for ReplicaConfigMultiPaxos { max_batch_size: 5000, backer_path: "/tmp/summerset.multipaxos.wal".into(), logger_sync: false, + perf_storage_a: 0, + perf_storage_b: 0, + perf_network_a: 0, + perf_network_b: 0, } } } @@ -114,7 +126,7 @@ struct Instance { } /// Stable storage log entry type. 
-#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum LogEntry { /// Records an update to the largest prepare ballot seen. PrepareBal { slot: usize, ballot: Ballot }, @@ -131,7 +143,7 @@ enum LogEntry { } /// Peer-peer message type. -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, GetSize)] enum PeerMsg { /// Prepare message from leader to replicas. Prepare { slot: usize, ballot: Ballot }, @@ -989,7 +1001,9 @@ impl GenericReplica for MultiPaxosReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigMultiPaxos; batch_interval_us, max_batch_size, - backer_path, logger_sync)?; + backer_path, logger_sync, + perf_storage_a, perf_storage_b, + perf_network_a, perf_network_b)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -1002,13 +1016,29 @@ impl GenericReplica for MultiPaxosReplica { let state_machine = StateMachine::new_and_setup(id).await?; // setup storage hub module - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; + let storage_hub = StorageHub::new_and_setup( + id, + Path::new(&config.backer_path), + if config.perf_storage_a == 0 && config.perf_storage_b == 0 { + None + } else { + Some((config.perf_storage_a, config.perf_storage_b)) + }, + ) + .await?; // setup transport hub module - let mut transport_hub = - TransportHub::new_and_setup(id, population, p2p_addr).await?; + let mut transport_hub = TransportHub::new_and_setup( + id, + population, + p2p_addr, + if config.perf_network_a == 0 && config.perf_network_b == 0 { + None + } else { + Some((config.perf_network_a, config.perf_network_b)) + }, + ) + .await?; // ask for the list of peers to proactively connect to. Do this after // transport hub has been set up, so that I will be able to accept diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index ffbc57e1..a6f19997 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -18,6 +18,8 @@ use crate::protocols::SmrProtocol; use async_trait::async_trait; +use get_size::GetSize; + use serde::{Serialize, Deserialize}; use tokio::time::Duration; @@ -37,6 +39,10 @@ pub struct ReplicaConfigRepNothing { /// Whether to call `fsync()`/`fdatasync()` on logger. pub logger_sync: bool, + + // Performance simulation params (all zeros means no perf simulation): + pub perf_storage_a: u64, + pub perf_storage_b: u64, } #[allow(clippy::derivable_impls)] @@ -47,12 +53,14 @@ impl Default for ReplicaConfigRepNothing { max_batch_size: 5000, backer_path: "/tmp/summerset.rep_nothing.wal".into(), logger_sync: false, + perf_storage_a: 0, + perf_storage_b: 0, } } } /// Log entry type. 
-#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] struct LogEntry { reqs: Vec<(ClientId, ApiRequest)>, } @@ -296,7 +304,8 @@ impl GenericReplica for RepNothingReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigRepNothing; batch_interval_us, max_batch_size, - backer_path, logger_sync)?; + backer_path, logger_sync, + perf_storage_a, perf_storage_b)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -309,9 +318,16 @@ impl GenericReplica for RepNothingReplica { let state_machine = StateMachine::new_and_setup(id).await?; // setup storage hub module - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; + let storage_hub = StorageHub::new_and_setup( + id, + Path::new(&config.backer_path), + if config.perf_storage_a == 0 && config.perf_storage_b == 0 { + None + } else { + Some((config.perf_storage_a, config.perf_storage_b)) + }, + ) + .await?; // TransportHub is not needed in RepNothing diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index ada30d45..79d025de 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -19,6 +19,8 @@ use crate::protocols::SmrProtocol; use async_trait::async_trait; +use get_size::GetSize; + use serde::{Serialize, Deserialize}; use tokio::time::Duration; @@ -43,6 +45,12 @@ pub struct ReplicaConfigRSPaxos { /// Fault-tolerance level. pub fault_tolerance: u8, + + // Performance simulation params (all zeros means no perf simulation): + pub perf_storage_a: u64, + pub perf_storage_b: u64, + pub perf_network_a: u64, + pub perf_network_b: u64, } #[allow(clippy::derivable_impls)] @@ -54,6 +62,10 @@ impl Default for ReplicaConfigRSPaxos { backer_path: "/tmp/summerset.rs_paxos.wal".into(), logger_sync: false, fault_tolerance: 0, + perf_storage_a: 0, + perf_storage_b: 0, + perf_network_a: 0, + perf_network_b: 0, } } } @@ -116,7 +128,7 @@ struct Instance { } /// Stable storage log entry type. -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum LogEntry { /// Records an update to the largest prepare ballot seen. PrepareBal { slot: usize, ballot: Ballot }, @@ -133,7 +145,7 @@ enum LogEntry { } /// Peer-peer message type. -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, GetSize)] enum PeerMsg { /// Prepare message from leader to replicas. 
Prepare { slot: usize, ballot: Ballot }, @@ -1074,7 +1086,9 @@ impl GenericReplica for RSPaxosReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigRSPaxos; batch_interval_us, max_batch_size, - backer_path, logger_sync, fault_tolerance)?; + backer_path, logger_sync, fault_tolerance, + perf_storage_a, perf_storage_b, + perf_network_a, perf_network_b)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -1087,13 +1101,29 @@ impl GenericReplica for RSPaxosReplica { let state_machine = StateMachine::new_and_setup(id).await?; // setup storage hub module - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; + let storage_hub = StorageHub::new_and_setup( + id, + Path::new(&config.backer_path), + if config.perf_storage_a == 0 && config.perf_storage_b == 0 { + None + } else { + Some((config.perf_storage_a, config.perf_storage_b)) + }, + ) + .await?; // setup transport hub module - let mut transport_hub = - TransportHub::new_and_setup(id, population, p2p_addr).await?; + let mut transport_hub = TransportHub::new_and_setup( + id, + population, + p2p_addr, + if config.perf_network_a == 0 && config.perf_network_b == 0 { + None + } else { + Some((config.perf_network_a, config.perf_network_b)) + }, + ) + .await?; // ask for the list of peers to proactively connect to. Do this after // transport hub has been set up, so that I will be able to accept diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index 7d9aa763..73c1d068 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -19,6 +19,8 @@ use crate::protocols::SmrProtocol; use async_trait::async_trait; +use get_size::GetSize; + use serde::{Serialize, Deserialize}; use tokio::time::Duration; @@ -38,6 +40,12 @@ pub struct ReplicaConfigSimplePush { /// Number of peer servers to push each command to. pub rep_degree: u8, + + // Performance simulation params (all zeros means no perf simulation): + pub perf_storage_a: u64, + pub perf_storage_b: u64, + pub perf_network_a: u64, + pub perf_network_b: u64, } #[allow(clippy::derivable_impls)] @@ -48,12 +56,16 @@ impl Default for ReplicaConfigSimplePush { max_batch_size: 5000, backer_path: "/tmp/summerset.simple_push.wal".into(), rep_degree: 2, + perf_storage_a: 0, + perf_storage_b: 0, + perf_network_a: 0, + perf_network_b: 0, } } } /// Log entry type. -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum LogEntry { FromClient { reqs: Vec<(ClientId, ApiRequest)>, @@ -65,7 +77,7 @@ enum LogEntry { } /// Peer-peer message type. 
-#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, GetSize)] enum PushMsg { Push { src_inst_idx: usize, @@ -456,7 +468,9 @@ impl GenericReplica for SimplePushReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigSimplePush; batch_interval_us, max_batch_size, - backer_path, rep_degree)?; + backer_path, rep_degree, + perf_storage_a, perf_storage_b, + perf_network_a, perf_network_b)?; if config.batch_interval_us == 0 { return logged_err!( id; @@ -469,13 +483,29 @@ impl GenericReplica for SimplePushReplica { let state_machine = StateMachine::new_and_setup(id).await?; // setup storage hub module - let storage_hub = - StorageHub::new_and_setup(id, Path::new(&config.backer_path)) - .await?; + let storage_hub = StorageHub::new_and_setup( + id, + Path::new(&config.backer_path), + if config.perf_storage_a == 0 && config.perf_storage_b == 0 { + None + } else { + Some((config.perf_storage_a, config.perf_storage_b)) + }, + ) + .await?; // setup transport hub module - let mut transport_hub = - TransportHub::new_and_setup(id, population, p2p_addr).await?; + let mut transport_hub = TransportHub::new_and_setup( + id, + population, + p2p_addr, + if config.perf_network_a == 0 && config.perf_network_b == 0 { + None + } else { + Some((config.perf_network_a, config.perf_network_b)) + }, + ) + .await?; // ask for the list of peers to proactively connect to. Do this after // transport hub has been set up, so that I will be able to accept diff --git a/src/server/external.rs b/src/server/external.rs index c52a946c..769c1cd7 100644 --- a/src/server/external.rs +++ b/src/server/external.rs @@ -9,6 +9,8 @@ use crate::utils::{ use crate::server::{ReplicaId, Command, CommandResult}; use crate::client::ClientId; +use get_size::GetSize; + use bytes::BytesMut; use serde::{Serialize, Deserialize}; @@ -26,7 +28,7 @@ pub type RequestId = u64; /// Request received from client. // TODO: add information fields such as read-only flag... -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] pub enum ApiRequest { /// Regular request. Req { @@ -42,7 +44,7 @@ pub enum ApiRequest { } /// Reply back to client. -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] pub enum ApiReply { /// Reply to regular request. Reply { diff --git a/src/server/statemach.rs b/src/server/statemach.rs index fd2c7670..47196cf4 100644 --- a/src/server/statemach.rs +++ b/src/server/statemach.rs @@ -5,6 +5,8 @@ use std::collections::HashMap; use crate::utils::SummersetError; use crate::server::ReplicaId; +use get_size::GetSize; + use serde::{Serialize, Deserialize}; use tokio::sync::mpsc; @@ -14,7 +16,7 @@ use tokio::task::JoinHandle; pub type CommandId = u64; /// Command to the state machine. -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] pub enum Command { /// Get the value of given key. Get { key: String }, @@ -24,7 +26,7 @@ pub enum Command { } /// Command execution result returned by the state machine. -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] pub enum CommandResult { /// `Some(value)` if key is found in state machine, else `None`. 
Get { value: Option }, diff --git a/src/server/storage.rs b/src/server/storage.rs index 8522175e..5d0fc3cb 100644 --- a/src/server/storage.rs +++ b/src/server/storage.rs @@ -3,10 +3,13 @@ use std::fmt; use std::path::Path; use std::io::SeekFrom; +use std::sync::Arc; use crate::utils::SummersetError; use crate::server::ReplicaId; +use get_size::GetSize; + use serde::{Serialize, Deserialize, de::DeserializeOwned}; use rmp_serde::encode::to_vec as encode_to_vec; @@ -16,13 +19,14 @@ use tokio::fs::{self, File, OpenOptions}; use tokio::io::{AsyncReadExt, AsyncWriteExt, AsyncSeekExt}; use tokio::sync::mpsc; use tokio::task::JoinHandle; +use tokio::time::{self, Duration}; /// Log action ID type. pub type LogActionId = u64; /// Action command to the logger. File cursor will be positioned at EOF after /// every action. -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize, GetSize)] pub enum LogAction { /// Read a log entry out. Read { offset: usize }, @@ -45,7 +49,7 @@ pub enum LogAction { } /// Action result returned by the logger. -#[derive(Debug, Serialize, Deserialize, PartialEq)] +#[derive(Debug, Serialize, Deserialize, PartialEq, GetSize)] pub enum LogResult { /// `Some(entry)` if successful, else `None`. Read { entry: Option }, @@ -88,6 +92,7 @@ where + Clone + Serialize + DeserializeOwned + + GetSize + Send + Sync + 'static, @@ -99,6 +104,7 @@ where pub async fn new_and_setup( me: ReplicaId, path: &Path, + perf_a_b: Option<(u64, u64)>, // performance simulation params ) -> Result { // prepare backing file if !fs::try_exists(path).await? { @@ -111,11 +117,39 @@ where OpenOptions::new().read(true).write(true).open(path).await?; backer_file.seek(SeekFrom::End(0)).await?; // seek to EOF - let (tx_log, rx_log) = mpsc::unbounded_channel(); + let (tx_log, mut rx_log) = + mpsc::unbounded_channel::<(LogActionId, LogAction)>(); let (tx_ack, rx_ack) = mpsc::unbounded_channel(); - let logger_handle = - tokio::spawn(Self::logger_thread(me, backer_file, rx_log, tx_ack)); + // if doing performance delay simulation, add on-the-fly delay to + // each message received + let rx_log_true = if let Some((perf_a, perf_b)) = perf_a_b { + let (tx_log_delayed, rx_log_delayed) = mpsc::unbounded_channel(); + let tx_log_delayed_arc = Arc::new(tx_log_delayed); + + tokio::spawn(async move { + while let Some((id, log_action)) = rx_log.recv().await { + let tx_log_delayed_clone = tx_log_delayed_arc.clone(); + tokio::spawn(async move { + let approx_size = log_action.get_size() as u64; + let delay_ns = perf_a + approx_size * perf_b; + time::sleep(Duration::from_nanos(delay_ns)).await; + tx_log_delayed_clone.send((id, log_action)).unwrap(); + }); + } + }); + + rx_log_delayed + } else { + rx_log + }; + + let logger_handle = tokio::spawn(Self::logger_thread( + me, + backer_file, + rx_log_true, + tx_ack, + )); Ok(StorageHub { me, @@ -426,7 +460,7 @@ mod storage_tests { use super::*; use rmp_serde::encode::to_vec as encode_to_vec; - #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] + #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, GetSize)] struct TestEntry(String); async fn prepare_test_file(path: &str) -> Result { @@ -649,7 +683,7 @@ mod storage_tests { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn api_log_ack() -> Result<(), SummersetError> { let path = Path::new("/tmp/test-backer-6.log"); - let mut hub = StorageHub::new_and_setup(0, path).await?; + let mut hub = StorageHub::new_and_setup(0, path, None).await?; let entry = 
TestEntry("abcdefgh".into()); let entry_bytes = encode_to_vec(&entry)?; hub.submit_action(0, LogAction::Append { entry, sync: true })?; diff --git a/src/server/transport.rs b/src/server/transport.rs index e7ca2998..a6a30ec8 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -2,6 +2,7 @@ use std::fmt; use std::net::SocketAddr; +use std::sync::Arc; use crate::utils::{ SummersetError, Bitmap, safe_tcp_read, safe_tcp_write, tcp_bind_with_retry, @@ -9,6 +10,8 @@ use crate::utils::{ }; use crate::server::ReplicaId; +use get_size::GetSize; + use bytes::BytesMut; use serde::{Serialize, Deserialize, de::DeserializeOwned}; @@ -21,7 +24,7 @@ use tokio::task::JoinHandle; use tokio::time::{self, Duration}; /// Peer-peer message wrapper type that includes leave notification variants. -#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum PeerMessage { /// Normal protocol-specific request. Msg { msg: Msg }, @@ -74,6 +77,7 @@ where + Clone + Serialize + DeserializeOwned + + GetSize + Send + Sync + 'static, @@ -85,12 +89,38 @@ where me: ReplicaId, population: u8, p2p_addr: SocketAddr, + perf_a_b: Option<(u64, u64)>, // performance simulation params ) -> Result { if population <= me { return logged_err!(me; "invalid population {}", population); } - let (tx_recv, rx_recv) = mpsc::unbounded_channel(); + let (tx_recv, mut rx_recv) = + mpsc::unbounded_channel::<(ReplicaId, PeerMessage)>(); + + // if doing performance delay simulation, add on-the-fly delay to + // each message received + let rx_recv_true = if let Some((perf_a, perf_b)) = perf_a_b { + let (tx_recv_delayed, rx_recv_delayed) = mpsc::unbounded_channel(); + let tx_recv_delayed_arc = Arc::new(tx_recv_delayed); + + tokio::spawn(async move { + while let Some((id, peer_msg)) = rx_recv.recv().await { + let tx_recv_delayed_clone = tx_recv_delayed_arc.clone(); + tokio::spawn(async move { + let approx_size = peer_msg.get_size() as u64; + let delay_ns = perf_a + approx_size * perf_b; + time::sleep(Duration::from_nanos(delay_ns)).await; + tx_recv_delayed_clone.send((id, peer_msg)).unwrap(); + }); + } + pf_error!("d"; "recv channel has been closed"); + }); + + rx_recv_delayed + } else { + rx_recv + }; let (tx_sends_write, tx_sends_read) = flashmap::new::< ReplicaId, @@ -119,7 +149,7 @@ where Ok(TransportHub { me, population, - rx_recv, + rx_recv: rx_recv_true, tx_sends: tx_sends_read, _peer_acceptor_handle: peer_acceptor_handle, tx_connect, @@ -661,7 +691,7 @@ mod transport_tests { use serde::{Serialize, Deserialize}; use tokio::sync::Barrier; - #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] + #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, GetSize)] struct TestMsg(String); #[tokio::test(flavor = "multi_thread", worker_threads = 2)] @@ -671,9 +701,13 @@ mod transport_tests { let barrier2 = barrier.clone(); tokio::spawn(async move { // replica 1 - let mut hub: TransportHub = - TransportHub::new_and_setup(1, 3, "127.0.0.1:53801".parse()?) - .await?; + let mut hub: TransportHub = TransportHub::new_and_setup( + 1, + 3, + "127.0.0.1:53801".parse()?, + None, + ) + .await?; barrier1.wait().await; hub.connect_to_peer(2, "127.0.0.1:53802".parse()?).await?; // recv a message from 0 @@ -696,9 +730,13 @@ mod transport_tests { }); tokio::spawn(async move { // replica 2 - let mut hub: TransportHub = - TransportHub::new_and_setup(2, 3, "127.0.0.1:53802".parse()?) 
- .await?; + let mut hub: TransportHub = TransportHub::new_and_setup( + 2, + 3, + "127.0.0.1:53802".parse()?, + None, + ) + .await?; barrier2.wait().await; // recv a message from 0 let (id, msg) = hub.recv_msg().await?; @@ -714,7 +752,7 @@ mod transport_tests { }); // replica 0 let mut hub: TransportHub = - TransportHub::new_and_setup(0, 3, "127.0.0.1:53800".parse()?) + TransportHub::new_and_setup(0, 3, "127.0.0.1:53800".parse()?, None) .await?; barrier.wait().await; hub.connect_to_peer(1, "127.0.0.1:53801".parse()?).await?; @@ -747,9 +785,13 @@ mod transport_tests { let barrier2 = barrier.clone(); tokio::spawn(async move { // replica 1/2 - let mut hub: TransportHub = - TransportHub::new_and_setup(1, 3, "127.0.0.1:54801".parse()?) - .await?; + let mut hub: TransportHub = TransportHub::new_and_setup( + 1, + 3, + "127.0.0.1:54801".parse()?, + None, + ) + .await?; barrier2.wait().await; // recv a message from 0 let (id, msg) = hub.recv_msg().await?; @@ -759,9 +801,13 @@ mod transport_tests { // leave and come back as 2 hub.leave().await?; time::sleep(Duration::from_millis(100)).await; - let mut hub: TransportHub = - TransportHub::new_and_setup(2, 3, "127.0.0.1:54802".parse()?) - .await?; + let mut hub: TransportHub = TransportHub::new_and_setup( + 2, + 3, + "127.0.0.1:54802".parse()?, + None, + ) + .await?; hub.connect_to_peer(0, "127.0.0.1:54800".parse()?).await?; // send a message to 0 hub.send_msg(TestMsg("hello".into()), 0)?; @@ -769,7 +815,7 @@ mod transport_tests { }); // replica 0 let mut hub: TransportHub = - TransportHub::new_and_setup(0, 3, "127.0.0.1:54800".parse()?) + TransportHub::new_and_setup(0, 3, "127.0.0.1:54800".parse()?, None) .await?; barrier.wait().await; hub.connect_to_peer(1, "127.0.0.1:54801".parse()?).await?; diff --git a/src/utils/rscoding.rs b/src/utils/rscoding.rs index 49c008a3..c8461c26 100644 --- a/src/utils/rscoding.rs +++ b/src/utils/rscoding.rs @@ -6,6 +6,8 @@ use std::marker::PhantomData; use crate::utils::{SummersetError, Bitmap}; +use get_size::GetSize; + use bytes::{BytesMut, BufMut}; use serde::{Serialize, Deserialize, de::DeserializeOwned}; @@ -43,6 +45,24 @@ pub struct RSCodeword { phantom: PhantomData, } +// implement `GetSize` trait for `RSCodeword`; the heap size is approximated +// simply by the sum of sizes of present shards +impl GetSize for RSCodeword +where + T: fmt::Debug + Clone + Serialize + DeserializeOwned + Send + Sync, +{ + fn get_heap_size(&self) -> usize { + self.shards + .iter() + .map(|s| if let Some(b) = s { b.len() } else { 0 }) + .sum() + } + + fn get_size(&self) -> usize { + Self::get_stack_size() + self.get_heap_size() + } +} + impl RSCodeword where T: fmt::Debug + Clone + Serialize + DeserializeOwned + Send + Sync, From 6bb7d3556aea1ad4df6268fb075370b64a40c5d4 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 20:39:17 +0800 Subject: [PATCH 23/89] minor updates to bench script --- scripts/local_bench.tmp.py | 3 ++- scripts/local_cluster.py | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 7107732b..fef021c1 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -1,3 +1,4 @@ +import sys import os import subprocess import statistics @@ -50,7 +51,7 @@ def wait_cluster_setup(proc, num_replicas): for line in iter(proc.stderr.readline, b""): l = line.decode() - # print(l, end="") + # print(l, end="", file=sys.stderr) if "manager" not in l and "accepting clients" in l: replica = int(l[l.find("(") + 1 : l.find(")")]) 
assert not accepting_clients[replica] diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index c4e0877c..9df8fe39 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -181,8 +181,6 @@ def launch_servers(protocol, num_replicas, release, config): def kill_spawned_procs(*args): for proc in server_procs: proc.terminate() - for proc in server_procs: - proc.wait() manager_proc.terminate() signal.signal(signal.SIGINT, kill_spawned_procs) From 27a0a2442479524b5a75bf5e1dc2c46900914052 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 20:43:31 +0800 Subject: [PATCH 24/89] minor updates to bench script --- scripts/local_cluster.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index 9df8fe39..c4e0877c 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -181,6 +181,8 @@ def launch_servers(protocol, num_replicas, release, config): def kill_spawned_procs(*args): for proc in server_procs: proc.terminate() + for proc in server_procs: + proc.wait() manager_proc.terminate() signal.signal(signal.SIGINT, kill_spawned_procs) From 28a64d1c1dc504f20b3483851deccdf8201ae077 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 21:00:56 +0800 Subject: [PATCH 25/89] fixing bench script hanging issue --- scripts/local_bench.tmp.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index fef021c1..e2f6c277 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -142,6 +142,9 @@ def bench_round( wait_cluster_setup(proc_cluster, num_replicas) proc_client = run_bench_client(protocol, value_size, put_ratio, length_s) + for line in iter(proc_client.stderr.readline, b""): + l = line.decode() + print(l, end="", file=sys.stderr) out, err = proc_client.communicate() proc_cluster.terminate() From 8ace60cc4d7d7508fdbf4f110f2b45873469a5d6 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 21:04:50 +0800 Subject: [PATCH 26/89] fixing bench script hanging issue --- scripts/local_bench.tmp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index e2f6c277..b1d59e2c 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -19,7 +19,7 @@ def do_cargo_build(): def run_process(cmd): # print("Run:", " ".join(cmd)) - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) return proc @@ -49,7 +49,7 @@ def launch_cluster(protocol, num_replicas, config): def wait_cluster_setup(proc, num_replicas): accepting_clients = [False for _ in range(num_replicas)] - for line in iter(proc.stderr.readline, b""): + for line in iter(proc.stdout.readline, b""): l = line.decode() # print(l, end="", file=sys.stderr) if "manager" not in l and "accepting clients" in l: @@ -142,7 +142,7 @@ def bench_round( wait_cluster_setup(proc_cluster, num_replicas) proc_client = run_bench_client(protocol, value_size, put_ratio, length_s) - for line in iter(proc_client.stderr.readline, b""): + for line in iter(proc_client.stdout.readline, b""): l = line.decode() print(l, end="", file=sys.stderr) out, err = proc_client.communicate() From 76c3fe99ebb916e04fd2aec9a2f231911aaa6a92 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 21:28:25 +0800 Subject: [PATCH 27/89] fixing bench script hanging issue --- scripts/local_bench.tmp.py | 25 
++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index b1d59e2c..2d01e4f3 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -19,16 +19,24 @@ def do_cargo_build(): def run_process(cmd): # print("Run:", " ".join(cmd)) - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) return proc def kill_all_matching(name, force=False): # print("Kill all:", name) assert name.count(" ") == 0 - cmd = "killall -9" if force else "killall" - cmd += f" {name} > /dev/null 2>&1" - os.system(cmd) + + pgrep_cmd = ["sudo", "pgrep", "-f", name] + pids = subprocess.check_output(pgrep_cmd, shell=True).decode() + + pids = pids.strip().split("\n") + for pid in pids: + pid = pid.strip() + if len(pid) > 0: + kill_cmd = f"sudo kill -9" if force else "sudo kill" + kill_cmd += f" {int(pid)} > /dev/null 2>&1" + os.system(kill_cmd) def launch_cluster(protocol, num_replicas, config): @@ -49,9 +57,9 @@ def launch_cluster(protocol, num_replicas, config): def wait_cluster_setup(proc, num_replicas): accepting_clients = [False for _ in range(num_replicas)] - for line in iter(proc.stdout.readline, b""): + for line in iter(proc.stderr.readline, b""): l = line.decode() - # print(l, end="", file=sys.stderr) + print(l, end="", file=sys.stderr) if "manager" not in l and "accepting clients" in l: replica = int(l[l.find("(") + 1 : l.find(")")]) assert not accepting_clients[replica] @@ -123,6 +131,8 @@ def bench_round( + f"s={shards_per_replica if shards_per_replica is not None else 'x':1} " + f"w%={put_ratio:<3d} {length_s:3d}s" ) + + kill_all_matching("local_cluster.py", force=True) kill_all_matching("summerset_client", force=True) kill_all_matching("summerset_server", force=True) kill_all_matching("summerset_manager", force=True) @@ -142,9 +152,6 @@ def bench_round( wait_cluster_setup(proc_cluster, num_replicas) proc_client = run_bench_client(protocol, value_size, put_ratio, length_s) - for line in iter(proc_client.stdout.readline, b""): - l = line.decode() - print(l, end="", file=sys.stderr) out, err = proc_client.communicate() proc_cluster.terminate() From 13ecd40f9b10b77acba1ce9207789080bd65d4e7 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 21:31:09 +0800 Subject: [PATCH 28/89] fixing bench script hanging issue --- scripts/local_bench.tmp.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 2d01e4f3..e60f98e6 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -26,17 +26,18 @@ def run_process(cmd): def kill_all_matching(name, force=False): # print("Kill all:", name) assert name.count(" ") == 0 - pgrep_cmd = ["sudo", "pgrep", "-f", name] - pids = subprocess.check_output(pgrep_cmd, shell=True).decode() - - pids = pids.strip().split("\n") - for pid in pids: - pid = pid.strip() - if len(pid) > 0: - kill_cmd = f"sudo kill -9" if force else "sudo kill" - kill_cmd += f" {int(pid)} > /dev/null 2>&1" - os.system(kill_cmd) + try: + pids = subprocess.check_output(pgrep_cmd, shell=True).decode() + pids = pids.strip().split("\n") + for pid in pids: + pid = pid.strip() + if len(pid) > 0: + kill_cmd = f"sudo kill -9" if force else "sudo kill" + kill_cmd += f" {int(pid)} > /dev/null 2>&1" + os.system(kill_cmd) + except subprocess.CalledProcessError: + pass def launch_cluster(protocol, 
num_replicas, config): From 5777e84cdd79b2463bf1bda50ffa006084631fa6 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 21:33:33 +0800 Subject: [PATCH 29/89] fixing bench script hanging issue --- scripts/local_bench.tmp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index e60f98e6..13c8b4ab 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -28,6 +28,7 @@ def kill_all_matching(name, force=False): assert name.count(" ") == 0 pgrep_cmd = ["sudo", "pgrep", "-f", name] try: + print("AAA") pids = subprocess.check_output(pgrep_cmd, shell=True).decode() pids = pids.strip().split("\n") for pid in pids: @@ -35,6 +36,7 @@ def kill_all_matching(name, force=False): if len(pid) > 0: kill_cmd = f"sudo kill -9" if force else "sudo kill" kill_cmd += f" {int(pid)} > /dev/null 2>&1" + print("BBB", kill_cmd) os.system(kill_cmd) except subprocess.CalledProcessError: pass From 461c8cc889106f67aa2b9bbbbf15cee496857e60 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 21:35:31 +0800 Subject: [PATCH 30/89] fixing bench script hanging issue --- scripts/local_bench.tmp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 13c8b4ab..ed308dbb 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -28,7 +28,7 @@ def kill_all_matching(name, force=False): assert name.count(" ") == 0 pgrep_cmd = ["sudo", "pgrep", "-f", name] try: - print("AAA") + print("AAA", pgrep_cmd) pids = subprocess.check_output(pgrep_cmd, shell=True).decode() pids = pids.strip().split("\n") for pid in pids: From 7264252c055548be6055b81f994739df945283e1 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 31 Aug 2023 21:36:18 +0800 Subject: [PATCH 31/89] fixing bench script hanging issue --- scripts/local_bench.tmp.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index ed308dbb..671b5497 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -26,9 +26,8 @@ def run_process(cmd): def kill_all_matching(name, force=False): # print("Kill all:", name) assert name.count(" ") == 0 - pgrep_cmd = ["sudo", "pgrep", "-f", name] try: - print("AAA", pgrep_cmd) + pgrep_cmd = f"sudo pgrep -f {name}" pids = subprocess.check_output(pgrep_cmd, shell=True).decode() pids = pids.strip().split("\n") for pid in pids: @@ -36,7 +35,6 @@ def kill_all_matching(name, force=False): if len(pid) > 0: kill_cmd = f"sudo kill -9" if force else "sudo kill" kill_cmd += f" {int(pid)} > /dev/null 2>&1" - print("BBB", kill_cmd) os.system(kill_cmd) except subprocess.CalledProcessError: pass From 3c07397c30364f0f58d993885122769c1a7d673e Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Fri, 1 Sep 2023 18:35:21 +0800 Subject: [PATCH 32/89] staging progress on log recovery --- src/protocols/crossword.rs | 1 + src/protocols/multipaxos.rs | 1 + src/protocols/rep_nothing.rs | 68 ++++++++++++++++++++++++++++++++++++ src/protocols/rs_paxos.rs | 1 + src/protocols/simple_push.rs | 1 + src/server/storage.rs | 49 +++++++++++++++----------- 6 files changed, 101 insertions(+), 20 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 5aff1728..012007b4 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -1310,6 +1310,7 @@ impl GenericReplica for CrosswordReplica { self.is_leader = true; } + // main event loop loop { tokio::select! 
{ // client request batch diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index dfcad00c..a0d06a19 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -1105,6 +1105,7 @@ impl GenericReplica for MultiPaxosReplica { self.is_leader = true; } + // main event loop loop { tokio::select! { // client request batch diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index a6f19997..849e3e05 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -287,6 +287,70 @@ impl RepNothingReplica { _ => Ok(None), // ignore all other types } } + + /// Recover state from durable storage log. + async fn recover_from_log(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.log_offset, 0); + loop { + // using 0 as a special log action ID + self.storage_hub.submit_action( + 0, + LogAction::Read { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(entry), + end_offset, + } => { + // execute all commands on state machine synchronously + for (_, req) in entry.reqs.clone() { + if let ApiRequest::Req { cmd, .. } = req { + // using 0 as a special command ID + self.state_machine.submit_cmd(0, cmd)?; + let _ = self.state_machine.get_result().await?; + } + } + // rebuild in-memory log + let num_reqs = entry.reqs.len(); + self.insts.push(Instance { + reqs: entry.reqs, + durable: true, + execed: vec![true; num_reqs], + }); + // update log offset + self.log_offset = end_offset; + } + LogResult::Read { entry: None, .. } => { + // end of log reached + break; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + } + + // do an extra Truncate to remove paritial entry at the end if any + self.storage_hub.submit_action( + 0, + LogAction::Truncate { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + if let LogResult::Truncate { + offset_ok: true, .. + } = log_result + { + Ok(()) + } else { + logged_err!(self.id; "unexpected log result type") + } + } } #[async_trait] @@ -367,6 +431,10 @@ impl GenericReplica for RepNothingReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { + // recover state from durable storage log + self.recover_from_log().await?; + + // main event loop loop { tokio::select! { // client request batch diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 79d025de..8dbbafb8 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -1203,6 +1203,7 @@ impl GenericReplica for RSPaxosReplica { self.is_leader = true; } + // main event loop loop { tokio::select! { // client request batch diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index 73c1d068..9deb6775 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -560,6 +560,7 @@ impl GenericReplica for SimplePushReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { + // main event loop loop { tokio::select! { // client request batch diff --git a/src/server/storage.rs b/src/server/storage.rs index 5d0fc3cb..99809e65 100644 --- a/src/server/storage.rs +++ b/src/server/storage.rs @@ -52,7 +52,10 @@ pub enum LogAction { #[derive(Debug, Serialize, Deserialize, PartialEq, GetSize)] pub enum LogResult { /// `Some(entry)` if successful, else `None`. - Read { entry: Option }, + Read { + entry: Option, + end_offset: usize, + }, /// `ok` is true if offset is valid, else false. 
`now_size` is the size /// of file after this. @@ -198,7 +201,7 @@ where backer: &mut File, file_size: usize, offset: usize, - ) -> Result, SummersetError> { + ) -> Result<(Option, usize), SummersetError> { if offset + 8 > file_size { pf_warn!( me; @@ -206,7 +209,7 @@ where offset + 8, file_size ); - return Ok(None); + return Ok((None, offset)); } // read entry length header @@ -216,7 +219,7 @@ where if offset_e > file_size { pf_warn!(me; "read entry invalid length {}", entry_len); backer.seek(SeekFrom::End(0)).await?; // recover cursor to EOF - return Ok(None); + return Ok((None, offset)); } // read entry content @@ -224,7 +227,7 @@ where backer.read_exact(&mut entry_buf[..]).await?; let entry = decode_from_slice(&entry_buf)?; backer.seek(SeekFrom::End(0)).await?; // recover cursor to EOF - Ok(Some(entry)) + Ok((Some(entry), offset_e)) } /// Write given entry to given offset. @@ -366,9 +369,9 @@ where ) -> Result, SummersetError> { match action { LogAction::Read { offset } => { - Self::read_entry(me, backer, *file_size, offset) - .await - .map(|entry| LogResult::Read { entry }) + Self::read_entry(me, backer, *file_size, offset).await.map( + |(entry, end_offset)| LogResult::Read { entry, end_offset }, + ) } LogAction::Write { entry, @@ -543,40 +546,45 @@ mod storage_tests { let mut backer_file = prepare_test_file("/tmp/test-backer-2.log").await?; let entry = TestEntry("test-entry-dummy-string".into()); - let now_size = + let mid_size = StorageHub::append_entry(0, &mut backer_file, 0, &entry, false) .await?; - let now_size = StorageHub::append_entry( + let end_size = StorageHub::append_entry( 0, &mut backer_file, - now_size, + mid_size, &entry, true, ) .await?; assert_eq!( - StorageHub::read_entry(0, &mut backer_file, now_size, 0).await?, - Some(TestEntry("test-entry-dummy-string".into())) + StorageHub::read_entry(0, &mut backer_file, end_size, mid_size) + .await?, + (Some(TestEntry("test-entry-dummy-string".into())), end_size) + ); + assert_eq!( + StorageHub::read_entry(0, &mut backer_file, end_size, 0).await?, + (Some(TestEntry("test-entry-dummy-string".into())), mid_size) ); assert_eq!( StorageHub::::read_entry( 0, &mut backer_file, - now_size, - now_size + 10 + end_size, + mid_size + 10 ) .await?, - None + (None, mid_size + 10) ); assert_eq!( StorageHub::::read_entry( 0, &mut backer_file, - now_size, - now_size - 4 + mid_size, + mid_size - 4 ) .await?, - None + (None, mid_size - 4) ); Ok(()) } @@ -703,7 +711,8 @@ mod storage_tests { ( 1, LogResult::Read { - entry: Some(TestEntry("abcdefgh".into())) + entry: Some(TestEntry("abcdefgh".into())), + end_offset: 8 + entry_bytes.len(), } ) ); From d3d607db36102dcfcda26ae50638109fc0f0ec50 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Fri, 1 Sep 2023 18:38:42 +0800 Subject: [PATCH 33/89] fixing bench script hanging issue --- scripts/local_bench.tmp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index 671b5497..ee007de3 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -6,8 +6,8 @@ PERF_STORAGE_ALPHA = 0 PERF_STORAGE_BETA = 0 -PERF_NETWORK_ALPHA = 10000 -PERF_NETWORK_BETA = 100 +PERF_NETWORK_ALPHA = 1000 +PERF_NETWORK_BETA = 10 def do_cargo_build(): From 9e711dd334de549938cfca06c16270581a8fc7a3 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 7 Sep 2023 14:09:16 +0800 Subject: [PATCH 34/89] add crash restart test --- summerset_client/src/clients/repl.rs | 72 +++++++++++++++++++------- summerset_client/src/clients/tester.rs | 40 
+++++++++----- 2 files changed, 79 insertions(+), 33 deletions(-) diff --git a/summerset_client/src/clients/repl.rs b/summerset_client/src/clients/repl.rs index c3f29a1a..0d3b60c0 100644 --- a/summerset_client/src/clients/repl.rs +++ b/summerset_client/src/clients/repl.rs @@ -15,6 +15,24 @@ use summerset::{ /// Prompt string at the start of line. const PROMPT: &str = ">>>>> "; +/// Recognizable command types. +enum ReplCommand { + /// Normal state machine replication command. + Normal(Command), + + /// Reconnect to the service. + Reconnect, + + /// Print help message. + PrintHelp, + + /// Client exit. + Exit, + + /// Nothing read. + Nothing, +} + /// Interactive REPL-style client struct. pub struct ClientRepl { /// Closed-loop request driver. @@ -47,6 +65,7 @@ impl ClientRepl { println!("HELP: Supported commands are:"); println!(" get "); println!(" put "); + println!(" reconnect"); println!(" help"); println!(" exit"); println!( @@ -56,17 +75,16 @@ impl ClientRepl { } /// Reads in user input and parses into a command. - fn read_command(&mut self) -> Result, SummersetError> { + fn read_command(&mut self) -> Result { self.input_buf.clear(); let nread = io::stdin().read_line(&mut self.input_buf)?; if nread == 0 { - println!("Exitting..."); - return Ok(None); + return Ok(ReplCommand::Exit); } let line: &str = self.input_buf.trim(); if line.is_empty() { - return Err(SummersetError("".into())); + return Ok(ReplCommand::Nothing); } // split input line by whitespaces, getting an iterator of segments @@ -86,7 +104,7 @@ impl ClientRepl { } // keys and values are kept as-is, no case conversions - Ok(Some(Command::Get { + Ok(ReplCommand::Normal(Command::Get { key: key.unwrap().into(), })) } @@ -105,21 +123,17 @@ impl ClientRepl { return Err(err); } - Ok(Some(Command::Put { + Ok(ReplCommand::Normal(Command::Put { key: key.unwrap().into(), value: value.unwrap().into(), })) } - "help" => { - self.print_help(None); - Err(SummersetError("".into())) - } + "help" => Ok(ReplCommand::PrintHelp), - "exit" => { - println!("Exitting..."); - Ok(None) - } + "reconnect" => Ok(ReplCommand::Reconnect), + + "exit" => Ok(ReplCommand::Exit), _ => { let err = SummersetError(format!( @@ -174,14 +188,32 @@ impl ClientRepl { self.print_prompt(); let cmd = self.read_command()?; - if cmd.is_none() { - return Ok(false); - } + match cmd { + ReplCommand::Exit => { + println!("Exitting..."); + Ok(false) + } + + ReplCommand::Nothing => Ok(true), - let result = self.eval_command(cmd.unwrap()).await?; + ReplCommand::Reconnect => { + println!("Reconnecting..."); + self.driver.leave(false).await?; + self.driver.connect().await?; + Ok(true) + } - self.print_result(result); - Ok(true) + ReplCommand::PrintHelp => { + self.print_help(None); + Ok(true) + } + + ReplCommand::Normal(cmd) => { + let result = self.eval_command(cmd).await?; + self.print_result(result); + Ok(true) + } + } } /// Runs the infinite REPL loop. diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index 9ad2c3d6..b097256e 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -18,8 +18,8 @@ use serde::Deserialize; use tokio::time::Duration; use summerset::{ - GenericEndpoint, CommandResult, RequestId, CtrlRequest, CtrlReply, - SummersetError, pf_error, logged_err, parsed_config, + ReplicaId, GenericEndpoint, CommandResult, RequestId, CtrlRequest, + CtrlReply, SummersetError, pf_error, logged_err, parsed_config, }; lazy_static! 
{ @@ -215,15 +215,16 @@ impl ClientTester { } } - /// Resets all servers in the cluster to initial empty state. - async fn reset_cluster(&mut self) -> Result<(), SummersetError> { + /// Resets some server(s) in the cluster. + async fn reset_server( + &mut self, + server: Option, + durable: bool, + ) -> Result<(), SummersetError> { let ctrl_stub = self.driver.ctrl_stub(); // send ResetServer request to manager - let req = CtrlRequest::ResetServer { - server: None, - durable: false, - }; + let req = CtrlRequest::ResetServer { server, durable }; let mut sent = ctrl_stub.send_req(Some(&req))?; while !sent { sent = ctrl_stub.send_req(None)?; @@ -243,7 +244,7 @@ impl ClientTester { name: &str, ) -> Result<(), SummersetError> { // reset everything to initial state at the start of each test - self.reset_cluster().await?; + self.reset_server(None, false).await?; self.driver.connect().await?; self.cached_replies.clear(); @@ -330,18 +331,31 @@ impl ClientTester { /// Client leaves and reconnects. async fn test_reconnect(&mut self) -> Result<(), SummersetError> { - let v0 = Self::gen_rand_string(8); - let mut req_id = self.issue_put("Jose", &v0)?; + let v = Self::gen_rand_string(8); + let mut req_id = self.issue_put("Jose", &v)?; self.expect_put_reply(req_id, Some(None), 1).await?; self.driver.leave(false).await?; self.driver.connect().await?; req_id = self.issue_get("Jose")?; - self.expect_get_reply(req_id, Some(Some(&v0)), 1).await?; + self.expect_get_reply(req_id, Some(Some(&v)), 1).await?; Ok(()) } /// Replica node crashes and restarts. async fn test_crash_restart(&mut self) -> Result<(), SummersetError> { - todo!("TODO") + let v = Self::gen_rand_string(8); + let mut req_id = self.issue_put("Jose", &v)?; + self.expect_put_reply(req_id, Some(None), 1).await?; + self.driver.leave(false).await?; + self.reset_server(Some(1), true).await?; + self.driver.connect().await?; + req_id = self.issue_get("Jose")?; + self.expect_get_reply(req_id, Some(Some(&v)), 1).await?; + self.driver.leave(false).await?; + self.reset_server(Some(0), true).await?; + self.driver.connect().await?; + req_id = self.issue_get("Jose")?; + self.expect_get_reply(req_id, Some(Some(&v)), 1).await?; + Ok(()) } } From f69c8254d7aa8162e4128777af685d65fd3f508a Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 7 Sep 2023 14:25:05 +0800 Subject: [PATCH 35/89] add log recovery logic to SimplePush --- src/protocols/simple_push.rs | 79 ++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index 9deb6775..dcd5204e 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -72,6 +72,7 @@ enum LogEntry { }, PeerPushed { peer: ReplicaId, + src_inst_idx: usize, reqs: Vec<(ClientId, ApiRequest)>, }, } @@ -285,6 +286,7 @@ impl SimplePushReplica { // submit log action to make this instance durable let log_entry = LogEntry::PeerPushed { peer, + src_inst_idx, reqs: req_batch.clone(), }; self.storage_hub.submit_action( @@ -450,6 +452,80 @@ impl SimplePushReplica { _ => Ok(None), // ignore all other types } } + + /// Recover state from durable storage log. 
+ async fn recover_from_log(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.log_offset, 0); + loop { + // using 0 as a special log action ID + self.storage_hub.submit_action( + 0, + LogAction::Read { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(entry), + end_offset, + } => { + let (from_peer, reqs) = match entry { + LogEntry::FromClient { reqs } => (None, reqs), + LogEntry::PeerPushed { + peer, + src_inst_idx, + reqs, + } => (Some((peer, src_inst_idx)), reqs), + }; + // execute all commands on state machine synchronously + for (_, req) in reqs.clone() { + if let ApiRequest::Req { cmd, .. } = req { + // using 0 as a special command ID + self.state_machine.submit_cmd(0, cmd)?; + let _ = self.state_machine.get_result().await?; + } + } + // rebuild in-memory log + let num_reqs = reqs.len(); + self.insts.push(Instance { + reqs, + durable: true, + pending_peers: Bitmap::new(self.population, false), + execed: vec![true; num_reqs], + from_peer, + }); + // update log offset + self.log_offset = end_offset; + } + LogResult::Read { entry: None, .. } => { + // end of log reached + break; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + } + + // do an extra Truncate to remove paritial entry at the end if any + self.storage_hub.submit_action( + 0, + LogAction::Truncate { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + if let LogResult::Truncate { + offset_ok: true, .. + } = log_result + { + Ok(()) + } else { + logged_err!(self.id; "unexpected log result type") + } + } } #[async_trait] @@ -560,6 +636,9 @@ impl GenericReplica for SimplePushReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { + // recover state from durable storage log + self.recover_from_log().await?; + // main event loop loop { tokio::select! { From e24e464bc826ef93391c42fb7a581e29b6ef3cb9 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 7 Sep 2023 20:16:00 +0800 Subject: [PATCH 36/89] add log recovery logic to Paxos variants --- src/protocols/crossword.rs | 163 ++++++++++++++++++++++ src/protocols/multipaxos.rs | 142 +++++++++++++++++++ src/protocols/rep_nothing.rs | 2 +- src/protocols/rs_paxos.rs | 163 ++++++++++++++++++++++ src/protocols/simple_push.rs | 2 +- summerset_client/src/clients/tester.rs | 8 +- summerset_client/src/drivers/open_loop.rs | 7 +- 7 files changed, 479 insertions(+), 8 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 012007b4..9ef9b2b5 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -1168,6 +1168,166 @@ impl CrosswordReplica { _ => Ok(None), // ignore all other types } } + + /// Apply a durable storage log entry for recovery. 
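The recover_from_log routines added in these patches all follow the same replay-then-truncate shape: read WAL entries one at a time, apply each to in-memory state synchronously, keep track of how far the durable log is valid, then truncate whatever partial entry a crash may have left at the tail. A minimal self-contained Rust sketch of that shape, where ToyLog is a hypothetical stand-in and not the crate's StorageHub API:

// Toy write-ahead log: a vector of entries plus a flag standing in for a
// torn partial entry at the tail.
struct ToyLog {
    entries: Vec<u64>,
    partial_tail: bool,
}

impl ToyLog {
    // Read the entry at `offset`; return it with the next offset, or None at EOF.
    fn read(&self, offset: usize) -> Option<(u64, usize)> {
        self.entries.get(offset).map(|e| (*e, offset + 1))
    }

    // Drop anything after `offset` (here just the torn tail).
    fn truncate(&mut self, _offset: usize) {
        self.partial_tail = false;
    }
}

// Replay the whole log into in-memory state, then truncate the torn tail.
fn recover(log: &mut ToyLog) -> Vec<u64> {
    let mut state = Vec::new();
    let mut offset = 0;
    while let Some((entry, end_offset)) = log.read(offset) {
        state.push(entry);   // "execute" the entry synchronously
        offset = end_offset; // advance the valid log offset
    }
    log.truncate(offset);    // remove any partial entry left by a crash
    state
}

fn main() {
    let mut log = ToyLog { entries: vec![1, 2, 3], partial_tail: true };
    assert_eq!(recover(&mut log), vec![1, 2, 3]);
    assert!(!log.partial_tail);
}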
+ async fn recover_apply_entry( + &mut self, + entry: LogEntry, + ) -> Result<(), SummersetError> { + match entry { + LogEntry::PrepareBal { slot, ballot } => { + // locate instance in memory, filling in null instances if needed + while self.insts.len() <= slot { + self.insts.push(Instance { + bal: 0, + status: Status::Null, + reqs_cw: RSCodeword::::from_null( + self.quorum_cnt, + self.population - self.quorum_cnt, + )?, + leader_bk: None, + replica_bk: None, + }); + } + // update instance state + let inst = &mut self.insts[slot]; + inst.bal = ballot; + inst.status = Status::Preparing; + // update bal_prep_sent and bal_max_seen, reset bal_prepared + if self.bal_prep_sent < ballot { + self.bal_prep_sent = ballot; + } + if self.bal_max_seen < ballot { + self.bal_max_seen = ballot; + } + self.bal_prepared = 0; + } + + LogEntry::AcceptData { + slot, + ballot, + reqs_cw, + } => { + // locate instance in memory, filling in null instances if needed + while self.insts.len() <= slot { + self.insts.push(Instance { + bal: 0, + status: Status::Null, + reqs_cw: RSCodeword::::from_null( + self.quorum_cnt, + self.population - self.quorum_cnt, + )?, + leader_bk: None, + replica_bk: None, + }); + } + // update instance state + let inst = &mut self.insts[slot]; + inst.bal = ballot; + inst.status = Status::Accepting; + inst.reqs_cw = reqs_cw; + // update bal_prepared and bal_max_seen + if self.bal_prepared < ballot { + self.bal_prepared = ballot; + } + if self.bal_max_seen < ballot { + self.bal_max_seen = ballot; + } + assert!(self.bal_prepared <= self.bal_prep_sent); + } + + LogEntry::CommitSlot { slot } => { + assert!(slot < self.insts.len()); + // update instance state + self.insts[slot].status = Status::Committed; + // submit commands in contiguously committed instance to the + // state machine + if slot == self.commit_bar { + while self.commit_bar < self.insts.len() { + let inst = &mut self.insts[self.commit_bar]; + if inst.status < Status::Committed { + break; + } + // check number of available shards + if inst.reqs_cw.avail_shards() < self.quorum_cnt { + // can't execute if I don't have the complete request batch + break; + } else if inst.reqs_cw.avail_data_shards() + < self.quorum_cnt + { + // have enough shards but need reconstruction + inst.reqs_cw + .reconstruct_data(Some(&self.rs_coder))?; + } + // execute all commands in this instance on state machine + // synchronously + for (_, req) in inst.reqs_cw.get_data()?.clone() { + if let ApiRequest::Req { cmd, .. } = req { + // using 0 as a special command ID + self.state_machine.submit_cmd(0, cmd)?; + let _ = self.state_machine.get_result().await?; + } + } + // update commit_bar and exec_bar + self.commit_bar += 1; + self.exec_bar += 1; + } + } + } + } + + Ok(()) + } + + /// Recover state from durable storage log. + async fn recover_from_log(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.log_offset, 0); + loop { + // using 0 as a special log action ID + self.storage_hub.submit_action( + 0, + LogAction::Read { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(entry), + end_offset, + } => { + self.recover_apply_entry(entry).await?; + // update log offset + self.log_offset = end_offset; + } + LogResult::Read { entry: None, .. 
} => { + // end of log reached + break; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + } + + // do an extra Truncate to remove paritial entry at the end if any + self.storage_hub.submit_action( + 0, + LogAction::Truncate { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + if let LogResult::Truncate { + offset_ok: true, .. + } = log_result + { + Ok(()) + } else { + logged_err!(self.id; "unexpected log result type") + } + } } #[async_trait] @@ -1305,6 +1465,9 @@ impl GenericReplica for CrosswordReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { + // recover state from durable storage log + self.recover_from_log().await?; + // TODO: proper leader election if self.id == 0 { self.is_leader = true; diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index a0d06a19..5ec372b9 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -983,6 +983,145 @@ impl MultiPaxosReplica { _ => Ok(None), // ignore all other types } } + + /// Apply a durable storage log entry for recovery. + async fn recover_apply_entry( + &mut self, + entry: LogEntry, + ) -> Result<(), SummersetError> { + match entry { + LogEntry::PrepareBal { slot, ballot } => { + // locate instance in memory, filling in null instances if needed + while self.insts.len() <= slot { + self.insts.push(Instance { + bal: 0, + status: Status::Null, + reqs: Vec::new(), + leader_bk: None, + replica_bk: None, + }); + } + // update instance state + let inst = &mut self.insts[slot]; + inst.bal = ballot; + inst.status = Status::Preparing; + // update bal_prep_sent and bal_max_seen, reset bal_prepared + if self.bal_prep_sent < ballot { + self.bal_prep_sent = ballot; + } + if self.bal_max_seen < ballot { + self.bal_max_seen = ballot; + } + self.bal_prepared = 0; + } + + LogEntry::AcceptData { slot, ballot, reqs } => { + // locate instance in memory, filling in null instances if needed + while self.insts.len() <= slot { + self.insts.push(Instance { + bal: 0, + status: Status::Null, + reqs: Vec::new(), + leader_bk: None, + replica_bk: None, + }); + } + // update instance state + let inst = &mut self.insts[slot]; + inst.bal = ballot; + inst.status = Status::Accepting; + inst.reqs = reqs; + // update bal_prepared and bal_max_seen + if self.bal_prepared < ballot { + self.bal_prepared = ballot; + } + if self.bal_max_seen < ballot { + self.bal_max_seen = ballot; + } + assert!(self.bal_prepared <= self.bal_prep_sent); + } + + LogEntry::CommitSlot { slot } => { + assert!(slot < self.insts.len()); + // update instance state + self.insts[slot].status = Status::Committed; + // submit commands in contiguously committed instance to the + // state machine + if slot == self.commit_bar { + while self.commit_bar < self.insts.len() { + let inst = &mut self.insts[self.commit_bar]; + if inst.status < Status::Committed { + break; + } + // execute all commands in this instance on state machine + // synchronously + for (_, req) in inst.reqs.clone() { + if let ApiRequest::Req { cmd, .. } = req { + // using 0 as a special command ID + self.state_machine.submit_cmd(0, cmd)?; + let _ = self.state_machine.get_result().await?; + } + } + // update commit_bar and exec_bar + self.commit_bar += 1; + self.exec_bar += 1; + } + } + } + } + + Ok(()) + } + + /// Recover state from durable storage log. 
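For the Paxos variants, recover_apply_entry replays three entry kinds: PrepareBal bumps a slot to Preparing and refreshes the prepare ballots, AcceptData fills in the slot's payload and moves it to Accepting, and CommitSlot marks it Committed so that contiguously committed slots can be re-executed. A toy, self-contained replay of just the per-slot bookkeeping (the types below are simplified stand-ins, not the crate's real Instance and Status definitions):

#[derive(PartialEq, Debug)]
enum ToyStatus {
    Null,
    Preparing,
    Accepting,
    Committed,
}

enum ToyEntry {
    PrepareBal { slot: usize, ballot: u64 },
    AcceptData { slot: usize, ballot: u64 },
    CommitSlot { slot: usize },
}

// Rebuild (ballot, status) per slot from a replayed entry sequence.
fn replay(entries: &[ToyEntry]) -> Vec<(u64, ToyStatus)> {
    let mut insts: Vec<(u64, ToyStatus)> = Vec::new();
    for entry in entries {
        match entry {
            ToyEntry::PrepareBal { slot, ballot } => {
                while insts.len() <= *slot {
                    insts.push((0, ToyStatus::Null)); // fill in null instances
                }
                insts[*slot] = (*ballot, ToyStatus::Preparing);
            }
            ToyEntry::AcceptData { slot, ballot } => {
                while insts.len() <= *slot {
                    insts.push((0, ToyStatus::Null));
                }
                insts[*slot] = (*ballot, ToyStatus::Accepting);
            }
            ToyEntry::CommitSlot { slot } => {
                insts[*slot].1 = ToyStatus::Committed;
            }
        }
    }
    insts
}

fn main() {
    let wal = [
        ToyEntry::PrepareBal { slot: 0, ballot: 7 },
        ToyEntry::AcceptData { slot: 0, ballot: 7 },
        ToyEntry::CommitSlot { slot: 0 },
    ];
    assert_eq!(replay(&wal), vec![(7, ToyStatus::Committed)]);
}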
+ async fn recover_from_log(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.log_offset, 0); + loop { + // using 0 as a special log action ID + self.storage_hub.submit_action( + 0, + LogAction::Read { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(entry), + end_offset, + } => { + self.recover_apply_entry(entry).await?; + // update log offset + self.log_offset = end_offset; + } + LogResult::Read { entry: None, .. } => { + // end of log reached + break; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + } + + // do an extra Truncate to remove paritial entry at the end if any + self.storage_hub.submit_action( + 0, + LogAction::Truncate { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + if let LogResult::Truncate { + offset_ok: true, .. + } = log_result + { + Ok(()) + } else { + logged_err!(self.id; "unexpected log result type") + } + } } #[async_trait] @@ -1100,6 +1239,9 @@ impl GenericReplica for MultiPaxosReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { + // recover state from durable storage log + self.recover_from_log().await?; + // TODO: proper leader election if self.id == 0 { self.is_leader = true; diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index 849e3e05..6475de8d 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -314,7 +314,7 @@ impl RepNothingReplica { let _ = self.state_machine.get_result().await?; } } - // rebuild in-memory log + // rebuild in-memory log entry let num_reqs = entry.reqs.len(); self.insts.push(Instance { reqs: entry.reqs, diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 8dbbafb8..ed66f6ea 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -1068,6 +1068,166 @@ impl RSPaxosReplica { _ => Ok(None), // ignore all other types } } + + /// Apply a durable storage log entry for recovery. 
+ async fn recover_apply_entry( + &mut self, + entry: LogEntry, + ) -> Result<(), SummersetError> { + match entry { + LogEntry::PrepareBal { slot, ballot } => { + // locate instance in memory, filling in null instances if needed + while self.insts.len() <= slot { + self.insts.push(Instance { + bal: 0, + status: Status::Null, + reqs_cw: RSCodeword::::from_null( + self.quorum_cnt, + self.population - self.quorum_cnt, + )?, + leader_bk: None, + replica_bk: None, + }); + } + // update instance state + let inst = &mut self.insts[slot]; + inst.bal = ballot; + inst.status = Status::Preparing; + // update bal_prep_sent and bal_max_seen, reset bal_prepared + if self.bal_prep_sent < ballot { + self.bal_prep_sent = ballot; + } + if self.bal_max_seen < ballot { + self.bal_max_seen = ballot; + } + self.bal_prepared = 0; + } + + LogEntry::AcceptData { + slot, + ballot, + reqs_cw, + } => { + // locate instance in memory, filling in null instances if needed + while self.insts.len() <= slot { + self.insts.push(Instance { + bal: 0, + status: Status::Null, + reqs_cw: RSCodeword::::from_null( + self.quorum_cnt, + self.population - self.quorum_cnt, + )?, + leader_bk: None, + replica_bk: None, + }); + } + // update instance state + let inst = &mut self.insts[slot]; + inst.bal = ballot; + inst.status = Status::Accepting; + inst.reqs_cw = reqs_cw; + // update bal_prepared and bal_max_seen + if self.bal_prepared < ballot { + self.bal_prepared = ballot; + } + if self.bal_max_seen < ballot { + self.bal_max_seen = ballot; + } + assert!(self.bal_prepared <= self.bal_prep_sent); + } + + LogEntry::CommitSlot { slot } => { + assert!(slot < self.insts.len()); + // update instance state + self.insts[slot].status = Status::Committed; + // submit commands in contiguously committed instance to the + // state machine + if slot == self.commit_bar { + while self.commit_bar < self.insts.len() { + let inst = &mut self.insts[self.commit_bar]; + if inst.status < Status::Committed { + break; + } + // check number of available shards + if inst.reqs_cw.avail_shards() < self.quorum_cnt { + // can't execute if I don't have the complete request batch + break; + } else if inst.reqs_cw.avail_data_shards() + < self.quorum_cnt + { + // have enough shards but need reconstruction + inst.reqs_cw + .reconstruct_data(Some(&self.rs_coder))?; + } + // execute all commands in this instance on state machine + // synchronously + for (_, req) in inst.reqs_cw.get_data()?.clone() { + if let ApiRequest::Req { cmd, .. } = req { + // using 0 as a special command ID + self.state_machine.submit_cmd(0, cmd)?; + let _ = self.state_machine.get_result().await?; + } + } + // update commit_bar and exec_bar + self.commit_bar += 1; + self.exec_bar += 1; + } + } + } + } + + Ok(()) + } + + /// Recover state from durable storage log. + async fn recover_from_log(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.log_offset, 0); + loop { + // using 0 as a special log action ID + self.storage_hub.submit_action( + 0, + LogAction::Read { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(entry), + end_offset, + } => { + self.recover_apply_entry(entry).await?; + // update log offset + self.log_offset = end_offset; + } + LogResult::Read { entry: None, .. 
} => { + // end of log reached + break; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + } + + // do an extra Truncate to remove paritial entry at the end if any + self.storage_hub.submit_action( + 0, + LogAction::Truncate { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + if let LogResult::Truncate { + offset_ok: true, .. + } = log_result + { + Ok(()) + } else { + logged_err!(self.id; "unexpected log result type") + } + } } #[async_trait] @@ -1198,6 +1358,9 @@ impl GenericReplica for RSPaxosReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { + // recover state from durable storage log + self.recover_from_log().await?; + // TODO: proper leader election if self.id == 0 { self.is_leader = true; diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index dcd5204e..7260bc27 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -487,7 +487,7 @@ impl SimplePushReplica { let _ = self.state_machine.get_result().await?; } } - // rebuild in-memory log + // rebuild in-memory log entry let num_reqs = reqs.len(); self.insts.push(Instance { reqs, diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index b097256e..b0246688 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -255,15 +255,15 @@ impl ClientTester { _ => return logged_err!("c"; "unrecognized test name '{}'", name), }; - // send leave notification and forget about the TCP connections at the - // end of each test - self.driver.leave(false).await?; - if let Err(ref e) = result { cprintln!("{:>16} | {:^6} | {}", name, "FAIL", e); } else { cprintln!("{:>16} | {:^6} | --", name, "PASS"); } + + // send leave notification and forget about the TCP connections at the + // end of each test + self.driver.leave(false).await?; result } diff --git a/summerset_client/src/drivers/open_loop.rs b/summerset_client/src/drivers/open_loop.rs index 433f68a2..057d414f 100644 --- a/summerset_client/src/drivers/open_loop.rs +++ b/summerset_client/src/drivers/open_loop.rs @@ -11,8 +11,8 @@ use tokio::time::{Duration, Instant}; use summerset::{ GenericEndpoint, ClientId, Command, CommandResult, ApiRequest, ApiReply, - RequestId, ClientCtrlStub, Timer, SummersetError, pf_debug, pf_error, - logged_err, + RequestId, ClientCtrlStub, Timer, SummersetError, pf_trace, pf_debug, + pf_error, logged_err, }; /// Open-loop driver struct. 
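The RSPaxos and Crossword versions of this recovery differ from MultiPaxos mainly in the execution gate: a committed slot can only be executed once at least quorum_cnt shards of its codeword are locally available, reconstructing the data shards first if fewer than quorum_cnt of them survived. The arithmetic behind those thresholds, as a standalone sketch that does no real Reed-Solomon coding and assumes the usual majority-quorum sizing:

fn main() {
    let population: u8 = 5;
    let quorum_cnt = population / 2 + 1;      // majority quorum = 3 data shards
    let parity_cnt = population - quorum_cnt; // = 2 parity shards
    assert_eq!((quorum_cnt, parity_cnt), (3, 2));

    // Each request batch is coded into 3 data shards plus 2 parity shards; any
    // 3 distinct shards suffice to reconstruct the original batch. A recovering
    // replica that logged only its own AcceptData shard holds 1 shard: it can
    // mark the slot Accepting, but must obtain more shards before the slot can
    // ever be executed.
    let avail_shards: u8 = 1;
    assert!(avail_shards < quorum_cnt);
    println!(
        "need any {} of {} shards to reconstruct and execute",
        quorum_cnt, population
    );
}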
@@ -68,9 +68,12 @@ impl DriverOpenLoop { ) -> Result<(), SummersetError> { // loop until all pending replies have been received while self.should_retry { + pf_trace!(self.id; "retrying last issue at leave"); self.issue_retry()?; } while !self.pending_reqs.is_empty() { + pf_trace!(self.id; "pending {} requests at leave", + self.pending_reqs.len()); self.wait_reply().await?; } From f036df5f05a4d5c782124d17d826ece6a16a0167 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Tue, 12 Sep 2023 08:46:09 -0500 Subject: [PATCH 37/89] minor changes to ResetServers control message --- src/manager/clusman.rs | 16 +++--- src/manager/reactor.rs | 8 +-- src/protocols/multipaxos.rs | 1 - src/utils/bitmap.rs | 2 +- summerset_client/src/clients/tester.rs | 67 ++++++++++++++++++-------- 5 files changed, 59 insertions(+), 35 deletions(-) diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index de18afb5..2a523972 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -244,19 +244,19 @@ impl ClusterManager { .send_reply(CtrlReply::QueryInfo { servers }, client) } - /// Handler of client ResetServer request. - async fn handle_client_reset_server( + /// Handler of client ResetServers request. + async fn handle_client_reset_servers( &mut self, client: ClientId, - server: Option, + servers: HashSet, durable: bool, ) -> Result<(), SummersetError> { let num_replicas = self.server_info.len(); - let mut servers: Vec = if server.is_none() { + let mut servers: Vec = if servers.is_empty() { // all active servers self.server_info.keys().copied().collect() } else { - vec![server.unwrap()] + servers.into_iter().collect() }; // reset specified server(s) @@ -291,7 +291,7 @@ impl ClusterManager { } self.client_reactor.send_reply( - CtrlReply::ResetServer { + CtrlReply::ResetServers { servers: reset_done, }, client, @@ -310,8 +310,8 @@ impl ClusterManager { self.handle_client_query_info(client)?; } - CtrlRequest::ResetServer { server, durable } => { - self.handle_client_reset_server(client, server, durable) + CtrlRequest::ResetServers { servers, durable } => { + self.handle_client_reset_servers(client, servers, durable) .await?; } diff --git a/src/manager/reactor.rs b/src/manager/reactor.rs index 41a0582d..e1e388c3 100644 --- a/src/manager/reactor.rs +++ b/src/manager/reactor.rs @@ -27,9 +27,9 @@ pub enum CtrlRequest { QueryInfo, /// Reset the specified server(s) to initial state. - ResetServer { - /// ID of server to reset. If `None`, resets all active servers. - server: Option, + ResetServers { + /// IDs of servers to reset. If empty, resets all active servers. + servers: HashSet, /// If false, cleans durable storage state as well. durable: bool, }, @@ -47,7 +47,7 @@ pub enum CtrlReply { }, /// Reply to server reset request. - ResetServer { servers: HashSet }, + ResetServers { servers: HashSet }, /// Reply to client leave notification. 
Leave, diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 5ec372b9..0ebf8564 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -314,7 +314,6 @@ impl MultiPaxosReplica { // create a new instance in the first null slot (or append a new one // at the end if no holes exist) - // TODO: maybe use a null_idx variable to better keep track of this let mut slot = self.insts.len(); for s in self.commit_bar..self.insts.len() { let old_inst = &mut self.insts[s]; diff --git a/src/utils/bitmap.rs b/src/utils/bitmap.rs index dfbb8467..d5fe9c8e 100644 --- a/src/utils/bitmap.rs +++ b/src/utils/bitmap.rs @@ -155,7 +155,7 @@ mod bitmap_tests { #[test] fn bitmap_iter() { - let ref_map = vec![true, true, false, true, true]; + let ref_map = [true, true, false, true, true]; let mut map = Bitmap::new(5, true); assert!(map.set(2, false).is_ok()); for (id, flag) in map.iter() { diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index b0246688..94528700 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -1,6 +1,6 @@ //! Correctness testing client using open-loop driver. -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use crate::drivers::DriverOpenLoop; @@ -25,9 +25,11 @@ use summerset::{ lazy_static! { /// List of all tests. If the flag is true, the test is marked as basic. static ref ALL_TESTS: Vec<(&'static str, bool)> = vec![ - ("primitives", true), - ("reconnect", true), - ("crash_restart", false), + ("primitive_ops", true), + ("client_reconnect", true), + ("node_1_crash", true), + ("node_0_crash", true), + ("two_nodes_crash", false) ]; } @@ -216,15 +218,15 @@ impl ClientTester { } /// Resets some server(s) in the cluster. - async fn reset_server( + async fn reset_servers( &mut self, - server: Option, + servers: HashSet, durable: bool, ) -> Result<(), SummersetError> { let ctrl_stub = self.driver.ctrl_stub(); // send ResetServer request to manager - let req = CtrlRequest::ResetServer { server, durable }; + let req = CtrlRequest::ResetServers { servers, durable }; let mut sent = ctrl_stub.send_req(Some(&req))?; while !sent { sent = ctrl_stub.send_req(None)?; @@ -233,7 +235,7 @@ impl ClientTester { // wait for reply from manager let reply = ctrl_stub.recv_reply().await?; match reply { - CtrlReply::ResetServer { .. } => Ok(()), + CtrlReply::ResetServers { .. 
} => Ok(()), _ => logged_err!("c"; "unexpected control reply type"), } } @@ -244,21 +246,23 @@ impl ClientTester { name: &str, ) -> Result<(), SummersetError> { // reset everything to initial state at the start of each test - self.reset_server(None, false).await?; + self.reset_servers(HashSet::new(), false).await?; self.driver.connect().await?; self.cached_replies.clear(); let result = match name { - "primitives" => self.test_primitives().await, - "reconnect" => self.test_reconnect().await, - "crash_restart" => self.test_crash_restart().await, + "primitive_ops" => self.test_primitive_ops().await, + "client_reconnect" => self.test_client_reconnect().await, + "node_1_crash" => self.test_node_1_crash().await, + "node_0_crash" => self.test_node_0_crash().await, + "two_nodes_crash" => self.test_two_nodes_crash().await, _ => return logged_err!("c"; "unrecognized test name '{}'", name), }; if let Err(ref e) = result { - cprintln!("{:>16} | {:^6} | {}", name, "FAIL", e); + cprintln!("{:>20} | {:^6} | {}", name, "FAIL", e); } else { - cprintln!("{:>16} | {:^6} | --", name, "PASS"); + cprintln!("{:>20} | {:^6} | --", name, "PASS"); } // send leave notification and forget about the TCP connections at the @@ -272,7 +276,7 @@ impl ClientTester { let test_name = self.params.test_name.clone(); let mut all_pass = true; - println!("{:^16} | {:^6} | Notes", "Test Case", "Result"); + println!("{:^20} | {:^6} | Notes", "Test Case", "Result"); match &test_name[..] { "basic" => { for (name, basic) in ALL_TESTS.iter() { @@ -313,7 +317,7 @@ impl ClientTester { // List of tests: impl ClientTester { /// Basic primitive operations. - async fn test_primitives(&mut self) -> Result<(), SummersetError> { + async fn test_primitive_ops(&mut self) -> Result<(), SummersetError> { let mut req_id = self.issue_get("Jose")?; self.expect_get_reply(req_id, Some(None), 1).await?; let v0 = Self::gen_rand_string(8); @@ -330,7 +334,7 @@ impl ClientTester { } /// Client leaves and reconnects. - async fn test_reconnect(&mut self) -> Result<(), SummersetError> { + async fn test_client_reconnect(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); let mut req_id = self.issue_put("Jose", &v)?; self.expect_put_reply(req_id, Some(None), 1).await?; @@ -341,18 +345,39 @@ impl ClientTester { Ok(()) } - /// Replica node crashes and restarts. - async fn test_crash_restart(&mut self) -> Result<(), SummersetError> { + /// Replica node 1 crashes and restarts. + async fn test_node_1_crash(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); let mut req_id = self.issue_put("Jose", &v)?; self.expect_put_reply(req_id, Some(None), 1).await?; self.driver.leave(false).await?; - self.reset_server(Some(1), true).await?; + self.reset_servers(HashSet::from([1]), true).await?; self.driver.connect().await?; req_id = self.issue_get("Jose")?; self.expect_get_reply(req_id, Some(Some(&v)), 1).await?; + Ok(()) + } + + /// Replica node 0 crashes and restarts. + async fn test_node_0_crash(&mut self) -> Result<(), SummersetError> { + let v = Self::gen_rand_string(8); + let mut req_id = self.issue_put("Jose", &v)?; + self.expect_put_reply(req_id, Some(None), 1).await?; + self.driver.leave(false).await?; + self.reset_servers(HashSet::from([0]), true).await?; + self.driver.connect().await?; + req_id = self.issue_get("Jose")?; + self.expect_get_reply(req_id, Some(Some(&v)), 1).await?; + Ok(()) + } + + /// Two replica nodes crashes and restarts. 
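These crash-and-restart tests all drive the same ResetServers round trip: build the control request, push it through the control stub until it is fully sent, then block on the matching reply. Condensed into one hypothetical helper for reference; the call shapes follow the tester code above, while the exact imports are assumed from the summerset crate:

use std::collections::HashSet;

use summerset::{ClientCtrlStub, CtrlReply, CtrlRequest, SummersetError};

// Reset servers 0 and 1 while keeping their durable WAL files, so that the
// restarted nodes must actually exercise the recovery path.
async fn reset_two_nodes(
    ctrl_stub: &mut ClientCtrlStub,
) -> Result<(), SummersetError> {
    let req = CtrlRequest::ResetServers {
        servers: HashSet::from([0, 1]), // empty set would mean all active servers
        durable: true,                  // false would wipe the WAL files too
    };

    // send_req returns false when the write would block; keep retrying with
    // None until the request has been fully flushed
    let mut sent = ctrl_stub.send_req(Some(&req))?;
    while !sent {
        sent = ctrl_stub.send_req(None)?;
    }

    match ctrl_stub.recv_reply().await? {
        CtrlReply::ResetServers { servers } => {
            println!("reset done on servers: {:?}", servers);
            Ok(())
        }
        _ => Err(SummersetError("unexpected control reply type".into())),
    }
}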
+ async fn test_two_nodes_crash(&mut self) -> Result<(), SummersetError> { + let v = Self::gen_rand_string(8); + let mut req_id = self.issue_put("Jose", &v)?; + self.expect_put_reply(req_id, Some(None), 1).await?; self.driver.leave(false).await?; - self.reset_server(Some(0), true).await?; + self.reset_servers(HashSet::from([0, 1]), true).await?; self.driver.connect().await?; req_id = self.issue_get("Jose")?; self.expect_get_reply(req_id, Some(Some(&v)), 1).await?; From 8fc6a2aa07021fbd3dc399903c33feee5932114a Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Tue, 12 Sep 2023 08:51:56 -0500 Subject: [PATCH 38/89] minor updates to README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 974767d1..16944f3a 100644 --- a/README.md +++ b/README.md @@ -147,7 +147,7 @@ Complete cluster management and benchmarking scripts are available in another re - [x] cluster manager oracle impl. - [x] implementation of MultiPaxos - [x] client-side timeout/retry logic - - [ ] state persistence & restart check + - [x] state persistence & restart check - [ ] automatic leader election, backoffs - [ ] snapshotting & garbage collection - [ ] specialize read-only commands? From b6ce402086e51f1a25a08c13809adf816cfe85d1 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Tue, 12 Sep 2023 14:15:31 -0500 Subject: [PATCH 39/89] staging progress on leader timeouts --- src/protocols/multipaxos.rs | 181 +++++++++++++++++++++++-- summerset_client/src/clients/tester.rs | 4 +- 2 files changed, 174 insertions(+), 11 deletions(-) diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 0ebf8564..4690c08d 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -11,7 +11,7 @@ use std::collections::HashMap; use std::path::Path; use std::net::SocketAddr; -use crate::utils::{SummersetError, Bitmap}; +use crate::utils::{SummersetError, Bitmap, Timer}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, @@ -21,13 +21,15 @@ use crate::server::{ use crate::client::{ClientId, ClientApiStub, ClientCtrlStub, GenericEndpoint}; use crate::protocols::SmrProtocol; +use rand::prelude::*; + use async_trait::async_trait; use get_size::GetSize; use serde::{Serialize, Deserialize}; -use tokio::time::Duration; +use tokio::time::{self, Duration, Interval, MissedTickBehavior}; use tokio::sync::watch; /// Configuration parameters struct. @@ -45,6 +47,15 @@ pub struct ReplicaConfigMultiPaxos { /// Whether to call `fsync()`/`fdatasync()` on logger. pub logger_sync: bool, + /// Min timeout of not hearing any heartbeat from leader in millisecs. + pub hb_hear_timeout_min: u64, + + /// Max timeout of not hearing any heartbeat from leader in millisecs. + pub hb_hear_timeout_max: u64, + + /// Interval of leader sending heartbeats to followers. + pub hb_send_interval_ms: u64, + // Performance simulation params (all zeros means no perf simulation): pub perf_storage_a: u64, pub perf_storage_b: u64, @@ -60,6 +71,9 @@ impl Default for ReplicaConfigMultiPaxos { max_batch_size: 5000, backer_path: "/tmp/summerset.multipaxos.wal".into(), logger_sync: false, + hb_hear_timeout_min: 300, + hb_hear_timeout_max: 600, + hb_send_interval_ms: 50, perf_storage_a: 0, perf_storage_b: 0, perf_network_a: 0, @@ -169,6 +183,9 @@ enum PeerMsg { /// Commit notification from leader to replicas. Commit { slot: usize }, + + /// Leader activity heartbeat. 
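A quick sanity check on the default timing knobs introduced here: with heartbeats broadcast every 50 ms and a hearing timeout drawn uniformly from 300 to 600 ms, a follower only suspects the leader after missing roughly 6 to 12 consecutive heartbeats, which keeps spurious leader changes unlikely on a healthy local cluster:

fn main() {
    // default config values from ReplicaConfigMultiPaxos above
    let hb_send_interval_ms: u64 = 50;
    let (hear_timeout_min, hear_timeout_max): (u64, u64) = (300, 600);

    let missed_min = hear_timeout_min / hb_send_interval_ms;
    let missed_max = hear_timeout_max / hb_send_interval_ms;
    assert_eq!((missed_min, missed_max), (6, 12));

    println!(
        "a follower suspects the leader after missing {}..={} heartbeats",
        missed_min, missed_max
    );
}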
+ Heartbeat { ballot: Ballot }, } /// MultiPaxos server replica module. @@ -206,6 +223,12 @@ pub struct MultiPaxosReplica { /// TransportHub module. transport_hub: TransportHub, + /// Timer for hearing heartbeat from leader. + hb_hear_timer: Timer, + + /// Interval for sending heartbeat to followers. + hb_send_interval: Interval, + /// Do I think I am the leader? is_leader: bool, @@ -344,9 +367,6 @@ impl MultiPaxosReplica { } // decide whether we can enter fast path for this instance - // TODO: remember to reset bal_prepared to 0, update bal_max_seen, - // and re-handle all Preparing & Accepting instances in autonomous - // Prepare initiation if self.bal_prepared == 0 { // slow case: Prepare phase not done yet. Initiate a Prepare round // if none is on the fly, or just wait for some Prepare reply to @@ -869,6 +889,7 @@ impl MultiPaxosReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), + PeerMsg::Heartbeat { ballot } => self.heard_heartbeat(peer, ballot), } } @@ -927,6 +948,111 @@ impl MultiPaxosReplica { Ok(()) } + /// Becomes a leader, sends self-initiated Prepare messages to followers + /// for all in-progress instances, and starts broadcasting heartbeats. + fn become_a_leader(&mut self) -> Result<(), SummersetError> { + assert!(!self.is_leader); + self.is_leader = true; // this starts broadcasting heartbeats + pf_warn!(self.id; "becoming a leader..."); + + // broadcast a heartbeat right now + self.bcast_heartbeats()?; + + // make a greater ballot number and invalidate all in-progress instances + self.bal_prepared = 0; + self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen); + self.bal_max_seen = self.bal_prep_sent; + + // redo Prepare phase for all in-progress instances + for (slot, inst) in self.insts.iter_mut().enumerate() { + if inst.status < Status::Committed { + inst.bal = self.bal_prep_sent; + inst.status = Status::Preparing; + pf_debug!(self.id; "enter Prepare phase for slot {} bal {}", + slot, inst.bal); + + // record update to largest prepare ballot + self.storage_hub.submit_action( + Self::make_log_action_id(slot, Status::Preparing), + LogAction::Append { + entry: LogEntry::PrepareBal { + slot, + ballot: self.bal_prep_sent, + }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted PrepareBal log action for slot {} bal {}", + slot, inst.bal); + + // send Prepare messages to all peers + self.transport_hub.bcast_msg( + PeerMsg::Prepare { + slot, + ballot: self.bal_prep_sent, + }, + None, + )?; + pf_trace!(self.id; "broadcast Prepare messages for slot {} bal {}", + slot, inst.bal); + } + } + + Ok(()) + } + + /// Broadcasts heartbeats to all replicas. + fn bcast_heartbeats(&mut self) -> Result<(), SummersetError> { + self.transport_hub.bcast_msg( + PeerMsg::Heartbeat { + ballot: self.bal_prep_sent, + }, + None, + )?; + self.heard_heartbeat(self.id, self.bal_prep_sent)?; + + // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent); + Ok(()) + } + + /// Chooses a random hb_hear_timeout from the min-max range and kicks off + /// the hb_hear_timer. + fn kickoff_hb_hear_timer(&mut self) -> Result<(), SummersetError> { + let timeout_ms = thread_rng().gen_range( + self.config.hb_hear_timeout_min..=self.config.hb_hear_timeout_max, + ); + + // pf_trace!(self.id; "kickoff hb_hear_timer @ {} ms", timeout_ms); + self.hb_hear_timer + .kickoff(Duration::from_millis(timeout_ms))?; + Ok(()) + } + + /// Heard a heartbeat from some other replica. 
If the heartbeat carries a + /// high enough ballot number, refreshes my hearing timer and clears my + /// leader status if I currently think I'm a leader. + fn heard_heartbeat( + &mut self, + _peer: ReplicaId, + ballot: Ballot, + ) -> Result<(), SummersetError> { + // ignore outdated hearbeat + if ballot < self.bal_max_seen { + return Ok(()); + } + + // reset hearing timer + self.kickoff_hb_hear_timer()?; + + // clear my leader status if it carries a higher ballot number + if self.is_leader && ballot > self.bal_max_seen { + self.is_leader = false; + } + + // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); + Ok(()) + } + /// Handler of ResetState control message. async fn handle_ctrl_reset_state( &mut self, @@ -1140,6 +1266,8 @@ impl GenericReplica for MultiPaxosReplica { let config = parsed_config!(config_str => ReplicaConfigMultiPaxos; batch_interval_us, max_batch_size, backer_path, logger_sync, + hb_hear_timeout_min, hb_hear_timeout_max, + hb_send_interval_ms, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; if config.batch_interval_us == 0 { @@ -1149,6 +1277,27 @@ impl GenericReplica for MultiPaxosReplica { config.batch_interval_us ); } + if config.hb_hear_timeout_min < 100 { + return logged_err!( + id; + "invalid config.hb_hear_timeout_min '{}'", + config.hb_hear_timeout_min + ); + } + if config.hb_hear_timeout_max < config.hb_hear_timeout_min + 100 { + return logged_err!( + id; + "invalid config.hb_hear_timeout_max '{}'", + config.hb_hear_timeout_max + ); + } + if config.hb_send_interval_ms == 0 { + return logged_err!( + id; + "invalid config.hb_send_interval_ms '{}'", + config.hb_send_interval_ms + ); + } // setup state machine module let state_machine = StateMachine::new_and_setup(id).await?; @@ -1211,6 +1360,10 @@ impl GenericReplica for MultiPaxosReplica { ) .await?; + let mut hb_send_interval = + time::interval(Duration::from_millis(config.hb_send_interval_ms)); + hb_send_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + Ok(MultiPaxosReplica { id, population, @@ -1223,6 +1376,8 @@ impl GenericReplica for MultiPaxosReplica { state_machine, storage_hub, transport_hub, + hb_hear_timer: Timer::new(), + hb_send_interval, is_leader: false, insts: vec![], bal_prep_sent: 0, @@ -1241,10 +1396,8 @@ impl GenericReplica for MultiPaxosReplica { // recover state from durable storage log self.recover_from_log().await?; - // TODO: proper leader election - if self.id == 0 { - self.is_leader = true; - } + // kick off leader activity hearing timer + self.kickoff_hb_hear_timer()?; // main event loop loop { @@ -1298,6 +1451,16 @@ impl GenericReplica for MultiPaxosReplica { } }, + // leader inactivity timeout + _ = self.hb_hear_timer.timeout() => { + self.become_a_leader()?; + }, + + // leader sending heartbeat + _ = self.hb_send_interval.tick(), if self.is_leader => { + self.bcast_heartbeats()?; + } + // manager control message ctrl_msg = self.control_hub.recv_ctrl() => { if let Err(e) = ctrl_msg { diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index 94528700..e7e6092e 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -169,7 +169,7 @@ impl ClientTester { &mut self, req_id: RequestId, expect_value: Option>, - // maximum number of tries if repeatedly getting `WouldBlock` failure + // maximum number of tries if repeatedly getting `Ok(None)` reply max_tries: u8, ) -> Result<(), SummersetError> { let cmd_result = self.wait_reply(req_id, max_tries).await?; @@ 
-196,7 +196,7 @@ impl ClientTester { &mut self, req_id: RequestId, expect_old_value: Option>, - // maximum number of tries if repeatedly getting `WouldBlock` failure + // maximum number of tries if repeatedly getting `Ok(None)` reply max_tries: u8, ) -> Result<(), SummersetError> { let cmd_result = self.wait_reply(req_id, max_tries).await?; From dadc65c2773c1d34ea6181ff02ea53b859b54d01 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 13 Sep 2023 06:02:59 -0500 Subject: [PATCH 40/89] better client driver side API --- scripts/local_bench.tmp.py | 1 + summerset_client/src/clients/bench.rs | 12 +- summerset_client/src/clients/repl.rs | 73 +++-- summerset_client/src/clients/tester.rs | 302 ++++++++++---------- summerset_client/src/drivers/closed_loop.rs | 66 +++-- summerset_client/src/drivers/mod.rs | 26 ++ summerset_client/src/drivers/open_loop.rs | 35 +-- 7 files changed, 286 insertions(+), 229 deletions(-) diff --git a/scripts/local_bench.tmp.py b/scripts/local_bench.tmp.py index ee007de3..96170e8c 100644 --- a/scripts/local_bench.tmp.py +++ b/scripts/local_bench.tmp.py @@ -133,6 +133,7 @@ def bench_round( + f"w%={put_ratio:<3d} {length_s:3d}s" ) + kill_all_matching("local_client.py", force=True) kill_all_matching("local_cluster.py", force=True) kill_all_matching("summerset_client", force=True) kill_all_matching("summerset_server", force=True) diff --git a/summerset_client/src/clients/bench.rs b/summerset_client/src/clients/bench.rs index c3f42cb7..6b6ea18a 100644 --- a/summerset_client/src/clients/bench.rs +++ b/summerset_client/src/clients/bench.rs @@ -1,6 +1,6 @@ //! Benchmarking client using open-loop driver. -use crate::drivers::DriverOpenLoop; +use crate::drivers::{DriverReply, DriverOpenLoop}; use lazy_static::lazy_static; @@ -168,7 +168,6 @@ impl ClientBench { } /// Runs one iteration action of closed-loop style benchmark. - #[allow(clippy::too_many_arguments)] async fn closed_loop_iter(&mut self) -> Result<(), SummersetError> { // send next request let req_id = if self.retrying { @@ -186,10 +185,10 @@ impl ClientBench { if self.total_cnt > self.reply_cnt { let result = self.driver.wait_reply().await?; - if let Some((_, _, lat)) = result { + if let DriverReply::Success { latency, .. } = result { self.reply_cnt += 1; self.chunk_cnt += 1; - let lat_us = lat.as_secs_f64() * 1000000.0; + let lat_us = latency.as_secs_f64() * 1000000.0; self.chunk_lats.push(lat_us); } } @@ -198,7 +197,6 @@ impl ClientBench { } /// Runs one iteration action of open-loop style benchmark. - #[allow(clippy::too_many_arguments)] async fn open_loop_iter(&mut self) -> Result<(), SummersetError> { tokio::select! { // prioritize receiving reply @@ -206,10 +204,10 @@ impl ClientBench { // receive next reply result = self.driver.wait_reply() => { - if let Some((_, _, lat)) = result? { + if let DriverReply::Success { latency, .. } = result? 
{ self.reply_cnt += 1; self.chunk_cnt += 1; - let lat_us = lat.as_secs_f64() * 1000000.0; + let lat_us = latency.as_secs_f64() * 1000000.0; self.chunk_lats.push(lat_us); if self.slowdown > 0 { diff --git a/summerset_client/src/clients/repl.rs b/summerset_client/src/clients/repl.rs index 0d3b60c0..09e4f330 100644 --- a/summerset_client/src/clients/repl.rs +++ b/summerset_client/src/clients/repl.rs @@ -2,15 +2,13 @@ use std::io::{self, Write}; -use crate::drivers::DriverClosedLoop; +use crate::drivers::{DriverReply, DriverClosedLoop}; -use color_print::cprint; +use color_print::{cprint, cprintln}; use tokio::time::Duration; -use summerset::{ - GenericEndpoint, Command, CommandResult, RequestId, SummersetError, -}; +use summerset::{GenericEndpoint, Command, SummersetError}; /// Prompt string at the start of line. const PROMPT: &str = ">>>>> "; @@ -38,6 +36,9 @@ pub struct ClientRepl { /// Closed-loop request driver. driver: DriverClosedLoop, + /// Timeout duration setting. + timeout: Duration, + /// User input buffer. input_buf: String, } @@ -47,6 +48,7 @@ impl ClientRepl { pub fn new(endpoint: Box, timeout: Duration) -> Self { ClientRepl { driver: DriverClosedLoop::new(endpoint, timeout), + timeout, input_buf: String::new(), } } @@ -60,7 +62,7 @@ impl ClientRepl { /// Prints (optionally) an error message and the help message. fn print_help(&mut self, err: Option<&SummersetError>) { if let Some(e) = err { - println!("ERROR: {}", e); + cprintln!("✗ {}", e); } println!("HELP: Supported commands are:"); println!(" get "); @@ -150,36 +152,51 @@ impl ClientRepl { async fn eval_command( &mut self, cmd: Command, - ) -> Result, SummersetError> - { + ) -> Result { match cmd { - Command::Get { key } => { - Ok(self.driver.get(&key).await?.map(|(req_id, value, lat)| { - (req_id, CommandResult::Get { value }, lat) - })) - } - + Command::Get { key } => Ok(self.driver.get(&key).await?), Command::Put { key, value } => { - Ok(self.driver.put(&key, &value).await?.map( - |(req_id, old_value, lat)| { - (req_id, CommandResult::Put { old_value }, lat) - }, - )) + Ok(self.driver.put(&key, &value).await?) } } } /// Prints command execution result. - fn print_result( - &mut self, - result: Option<(RequestId, CommandResult, Duration)>, - ) { - if let Some((req_id, cmd_result, lat)) = result { - let lat_ms = lat.as_secs_f64() * 1000.0; - println!("({}) {:?} ", req_id, cmd_result, lat_ms); - } else { - println!("Unsuccessful: wrong leader or timeout?"); + fn print_result(&mut self, result: DriverReply) { + match result { + DriverReply::Success { + req_id, + cmd_result, + latency, + } => { + let lat_ms = latency.as_secs_f64() * 1000.0; + cprintln!( + "✓ ({}) {:?} <>", + req_id, + cmd_result, + lat_ms + ); + } + + DriverReply::Failure => { + cprintln!("✗ service replied unknown error"); + } + + DriverReply::Redirect { server } => { + cprintln!( + "✗ service redirected me to server {}", + server + ); + } + + DriverReply::Timeout => { + cprintln!( + "✗ client-side timeout {} ms", + self.timeout.as_millis() + ); + } } + io::stdout().flush().unwrap(); } diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index e7e6092e..04b71a5b 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -1,8 +1,8 @@ -//! Correctness testing client using open-loop driver. +//! Correctness testing client using closed-loop driver. 
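A unit bookkeeping note for the driver refactor: the benchmark client stores latencies as f64 microseconds (latency.as_secs_f64() * 1000000.0) in its per-chunk vector, while the REPL prints milliseconds. A tiny standalone check of those conversions:

use std::time::Duration;

fn main() {
    let latency = Duration::from_micros(1500);

    // microseconds, as pushed into the benchmark's per-chunk latency vector
    let lat_us = latency.as_secs_f64() * 1_000_000.0;
    // milliseconds, as printed by the REPL client
    let lat_ms = latency.as_secs_f64() * 1000.0;

    assert!((lat_us - 1500.0).abs() < 1e-6);
    assert!((lat_ms - 1.5).abs() < 1e-9);
    println!("{:.0} us = {:.3} ms", lat_us, lat_ms);
}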
-use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; -use crate::drivers::DriverOpenLoop; +use crate::drivers::{DriverReply, DriverClosedLoop}; use color_print::cprintln; @@ -15,11 +15,11 @@ use rand::distributions::Alphanumeric; use serde::Deserialize; -use tokio::time::Duration; +use tokio::time::{self, Duration}; use summerset::{ - ReplicaId, GenericEndpoint, CommandResult, RequestId, CtrlRequest, - CtrlReply, SummersetError, pf_error, logged_err, parsed_config, + ReplicaId, GenericEndpoint, CommandResult, CtrlRequest, CtrlReply, + SummersetError, pf_error, logged_err, parsed_config, }; lazy_static! { @@ -27,9 +27,8 @@ lazy_static! { static ref ALL_TESTS: Vec<(&'static str, bool)> = vec![ ("primitive_ops", true), ("client_reconnect", true), - ("node_1_crash", true), - ("node_0_crash", true), - ("two_nodes_crash", false) + ("one_node_reset", true), + ("two_nodes_reset", false) ]; } @@ -60,14 +59,14 @@ impl Default for ModeParamsTester { /// Correctness testing client struct. pub struct ClientTester { - /// Open-loop request driver. - driver: DriverOpenLoop, + /// Closed-loop request driver. + driver: DriverClosedLoop, + + /// Timeout duration setting. + timeout: Duration, /// Mode parameters struct. params: ModeParamsTester, - - /// Replies received but not yet used. - cached_replies: HashMap, } impl ClientTester { @@ -86,9 +85,9 @@ impl ClientTester { } Ok(ClientTester { - driver: DriverOpenLoop::new(endpoint, timeout), + driver: DriverClosedLoop::new(endpoint, timeout), + timeout, params, - cached_replies: HashMap::new(), }) } @@ -106,114 +105,128 @@ impl ClientTester { s.as_deref() == *expect } - /// Issues a Get request, retrying immediately on `WouldBlock` failures. - fn issue_get(&mut self, key: &str) -> Result { - let mut req_id = self.driver.issue_get(key)?; - while req_id.is_none() { - req_id = self.driver.issue_retry()?; - } - Ok(req_id.unwrap()) - } - - /// Issues a Put request, retrying immediately on `WouldBlock` failures. - fn issue_put( + /// Issues a Get request and checks its reply value against given one if + /// not `None`. Retries immediately upon getting redirection error. + async fn checked_get( &mut self, key: &str, - value: &str, - ) -> Result { - let mut req_id = self.driver.issue_put(key, value)?; - while req_id.is_none() { - req_id = self.driver.issue_retry()?; - } - Ok(req_id.unwrap()) - } + expect_value: Option>, + ) -> Result<(), SummersetError> { + loop { + let result = self.driver.get(key).await?; + match result { + DriverReply::Success { cmd_result, .. } => { + if let CommandResult::Get { ref value } = cmd_result { + if let Some(ref expect_value) = expect_value { + if !Self::strings_match(value, expect_value) { + return logged_err!( + self.driver.id; + "Get value mismatch: expect {:?}, got {:?}", + expect_value, value + ); + } + } + return Ok(()); + } else { + return logged_err!( + self.driver.id; + "CommandResult type mismatch: expect Get" + ); + } + } - /// Waits for the next reply from service with the given request ID. If - /// non-match replies received, cache them up for future references. 
- async fn wait_reply( - &mut self, - req_id: RequestId, - // maximum number of tries if repeatedly getting `Ok(None)` reply - max_tries: u8, - ) -> Result { - assert!(max_tries > 0); - let mut num_tries = 0; - - // look up cached_replies first - if let Some(cmd_result) = self.cached_replies.remove(&req_id) { - return Ok(cmd_result); - } + DriverReply::Failure => { + return logged_err!( + self.driver.id; + "service replied unknown error" + ); + } - let mut result = self.driver.wait_reply().await?; - while result.is_none() || result.as_ref().unwrap().0 != req_id { - if let Some((id, cmd_result, _)) = result { - self.cached_replies.insert(id, cmd_result); - } else { - num_tries += 1; - if num_tries == max_tries { - return Err(SummersetError(format!( - "exhausted {} tries expecting req {}", - max_tries, req_id, - ))); + DriverReply::Redirect { .. } => {} // re-issue immediately + + DriverReply::Timeout => { + return logged_err!( + self.driver.id; + "client-side timeout {} ms", + self.timeout.as_millis() + ) } } - result = self.driver.wait_reply().await?; } - - Ok(result.unwrap().1) } - /// Waits for the reply of given request ID, expecting the given Get value - /// if not `None`. - async fn expect_get_reply( + /// Issues a Put request and checks its reply old_value against given one + /// if not `None`. Retries immediately upon getting redirection error. + async fn checked_put( &mut self, - req_id: RequestId, - expect_value: Option>, - // maximum number of tries if repeatedly getting `Ok(None)` reply - max_tries: u8, + key: &str, + value: &str, + expect_old_value: Option>, ) -> Result<(), SummersetError> { - let cmd_result = self.wait_reply(req_id, max_tries).await?; - if let CommandResult::Get { ref value } = cmd_result { - if let Some(ref expect_value) = expect_value { - if !Self::strings_match(value, expect_value) { - return Err(SummersetError(format!( - "Get value mismatch: expect {:?}, got {:?}", - expect_value, value - ))); + loop { + let result = self.driver.put(key, value).await?; + match result { + DriverReply::Success { cmd_result, .. } => { + if let CommandResult::Put { ref old_value } = cmd_result { + if let Some(ref expect_old_value) = expect_old_value { + if !Self::strings_match(old_value, expect_old_value) + { + return logged_err!( + self.driver.id; + "Put old_value mismatch: expect {:?}, got {:?}", + expect_old_value, old_value + ); + } + } + return Ok(()); + } else { + return logged_err!( + self.driver.id; + "CommandResult type mismatch: expect Put" + ); + } + } + + DriverReply::Failure => { + return logged_err!( + self.driver.id; + "service replied unknown error" + ); + } + + DriverReply::Redirect { .. } => {} // re-issue immediately + + DriverReply::Timeout => { + return logged_err!( + self.driver.id; + "client-side timeout {} ms", + self.timeout.as_millis() + ) } } - Ok(()) - } else { - Err(SummersetError( - "CommandResult type mismatch: expect Get".into(), - )) } } - /// Waits for the reply of given request ID, expecting the given Put - /// old_value if not `None`. - async fn expect_put_reply( + /// Query the list of servers in the cluster. 
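checked_get and checked_put share one skeleton: loop on the closed-loop driver, validate the result on Success, retry in place on Redirect, and bail out on Failure or a client-side Timeout. That skeleton reduced to a self-contained sketch, where ToyReply mirrors, but is not, the crate's DriverReply type:

enum ToyReply {
    Success(String),
    Redirect(u8),
    Failure,
    Timeout,
}

// One "issue" of a request; redirects twice, then succeeds.
fn issue(attempt: &mut u32) -> ToyReply {
    *attempt += 1;
    if *attempt <= 2 {
        ToyReply::Redirect(0)
    } else {
        ToyReply::Success("value".into())
    }
}

// Retry in place on redirection; stop on success, failure, or timeout.
fn checked_issue() -> Result<String, String> {
    let mut attempt = 0;
    loop {
        match issue(&mut attempt) {
            ToyReply::Success(v) => return Ok(v),
            ToyReply::Redirect(_) => continue, // re-issue the same request
            ToyReply::Failure => return Err("service replied unknown error".into()),
            ToyReply::Timeout => return Err("client-side timeout".into()),
        }
    }
}

fn main() {
    assert_eq!(checked_issue().unwrap(), "value");
}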
+ async fn query_servers( &mut self, - req_id: RequestId, - expect_old_value: Option>, - // maximum number of tries if repeatedly getting `Ok(None)` reply - max_tries: u8, - ) -> Result<(), SummersetError> { - let cmd_result = self.wait_reply(req_id, max_tries).await?; - if let CommandResult::Put { ref old_value } = cmd_result { - if let Some(ref expect_old_value) = expect_old_value { - if !Self::strings_match(old_value, expect_old_value) { - return Err(SummersetError(format!( - "Put old_value mismatch: expect {:?}, got {:?}", - expect_old_value, old_value - ))); - } + ) -> Result, SummersetError> { + let ctrl_stub = self.driver.ctrl_stub(); + + // send QueryInfo request to manager + let req = CtrlRequest::QueryInfo; + let mut sent = ctrl_stub.send_req(Some(&req))?; + while !sent { + sent = ctrl_stub.send_req(None)?; + } + + // wait for reply from manager + let reply = ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::QueryInfo { servers } => { + Ok(servers.keys().copied().collect()) } - Ok(()) - } else { - Err(SummersetError( - "CommandResult type mismatch: expect Put".into(), - )) + _ => logged_err!(self.driver.id; ""), } } @@ -236,7 +249,7 @@ impl ClientTester { let reply = ctrl_stub.recv_reply().await?; match reply { CtrlReply::ResetServers { .. } => Ok(()), - _ => logged_err!("c"; "unexpected control reply type"), + _ => logged_err!(self.driver.id; "unexpected control reply type"), } } @@ -248,15 +261,16 @@ impl ClientTester { // reset everything to initial state at the start of each test self.reset_servers(HashSet::new(), false).await?; self.driver.connect().await?; - self.cached_replies.clear(); let result = match name { "primitive_ops" => self.test_primitive_ops().await, "client_reconnect" => self.test_client_reconnect().await, - "node_1_crash" => self.test_node_1_crash().await, - "node_0_crash" => self.test_node_0_crash().await, - "two_nodes_crash" => self.test_two_nodes_crash().await, - _ => return logged_err!("c"; "unrecognized test name '{}'", name), + "one_node_reset" => self.test_one_node_reset().await, + "two_nodes_reset" => self.test_two_nodes_reset().await, + _ => { + return logged_err!(self.driver.id; "unrecognized test name '{}'", + name); + } }; if let Err(ref e) = result { @@ -318,69 +332,49 @@ impl ClientTester { impl ClientTester { /// Basic primitive operations. async fn test_primitive_ops(&mut self) -> Result<(), SummersetError> { - let mut req_id = self.issue_get("Jose")?; - self.expect_get_reply(req_id, Some(None), 1).await?; + self.checked_get("Jose", Some(None)).await?; let v0 = Self::gen_rand_string(8); - req_id = self.issue_put("Jose", &v0)?; - self.expect_put_reply(req_id, Some(None), 1).await?; - req_id = self.issue_get("Jose")?; - self.expect_get_reply(req_id, Some(Some(&v0)), 1).await?; + self.checked_put("Jose", &v0, Some(None)).await?; + self.checked_get("Jose", Some(Some(&v0))).await?; let v1 = Self::gen_rand_string(16); - req_id = self.issue_put("Jose", &v1)?; - self.expect_put_reply(req_id, Some(Some(&v0)), 1).await?; - req_id = self.issue_get("Jose")?; - self.expect_get_reply(req_id, Some(Some(&v1)), 1).await?; + self.checked_put("Jose", &v1, Some(Some(&v0))).await?; + self.checked_get("Jose", Some(Some(&v1))).await?; Ok(()) } /// Client leaves and reconnects. 
async fn test_client_reconnect(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); - let mut req_id = self.issue_put("Jose", &v)?; - self.expect_put_reply(req_id, Some(None), 1).await?; - self.driver.leave(false).await?; - self.driver.connect().await?; - req_id = self.issue_get("Jose")?; - self.expect_get_reply(req_id, Some(Some(&v)), 1).await?; - Ok(()) - } - - /// Replica node 1 crashes and restarts. - async fn test_node_1_crash(&mut self) -> Result<(), SummersetError> { - let v = Self::gen_rand_string(8); - let mut req_id = self.issue_put("Jose", &v)?; - self.expect_put_reply(req_id, Some(None), 1).await?; + self.checked_put("Jose", &v, Some(None)).await?; self.driver.leave(false).await?; - self.reset_servers(HashSet::from([1]), true).await?; self.driver.connect().await?; - req_id = self.issue_get("Jose")?; - self.expect_get_reply(req_id, Some(Some(&v)), 1).await?; + self.checked_get("Jose", Some(Some(&v))).await?; Ok(()) } - /// Replica node 0 crashes and restarts. - async fn test_node_0_crash(&mut self) -> Result<(), SummersetError> { + /// Single replica node crashes and restarts. + async fn test_one_node_reset(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); - let mut req_id = self.issue_put("Jose", &v)?; - self.expect_put_reply(req_id, Some(None), 1).await?; - self.driver.leave(false).await?; - self.reset_servers(HashSet::from([0]), true).await?; - self.driver.connect().await?; - req_id = self.issue_get("Jose")?; - self.expect_get_reply(req_id, Some(Some(&v)), 1).await?; + self.checked_put("Jose", &v, Some(None)).await?; + for s in self.query_servers().await? { + self.driver.leave(false).await?; + self.reset_servers(HashSet::from([s]), true).await?; + time::sleep(Duration::from_millis(100)).await; + self.driver.connect().await?; + self.checked_get("Jose", Some(Some(&v))).await?; + } Ok(()) } - /// Two replica nodes crashes and restarts. - async fn test_two_nodes_crash(&mut self) -> Result<(), SummersetError> { + /// Two replica nodes crash and restart. + async fn test_two_nodes_reset(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); - let mut req_id = self.issue_put("Jose", &v)?; - self.expect_put_reply(req_id, Some(None), 1).await?; + self.checked_put("Jose", &v, Some(None)).await?; self.driver.leave(false).await?; self.reset_servers(HashSet::from([0, 1]), true).await?; + time::sleep(Duration::from_millis(100)).await; self.driver.connect().await?; - req_id = self.issue_get("Jose")?; - self.expect_get_reply(req_id, Some(Some(&v)), 1).await?; + self.checked_get("Jose", Some(Some(&v))).await?; Ok(()) } } diff --git a/summerset_client/src/drivers/closed_loop.rs b/summerset_client/src/drivers/closed_loop.rs index cb361cf5..a0a96e87 100644 --- a/summerset_client/src/drivers/closed_loop.rs +++ b/summerset_client/src/drivers/closed_loop.rs @@ -1,5 +1,7 @@ //! Closed-loop client-side driver implementation. +use crate::drivers::DriverReply; + use tokio::time::{Duration, Instant}; use summerset::{ @@ -83,16 +85,11 @@ impl DriverClosedLoop { } } - /// Send a Get request and wait for its reply. Returns: - /// - `Ok(Some((id, Some(value), latency)))` if successful and key exists - /// - `Ok(Some((id, None, latency)))` if successful and key does not exist - /// - `Ok(None)` if request unsuccessful, e.g., wrong leader or timeout - /// - `Err(err)` if any unexpected error occurs + /// Send a Get request and wait for its reply. 
pub async fn get( &mut self, key: &str, - ) -> Result, Duration)>, SummersetError> - { + ) -> Result { let req_id = self.next_req; self.next_req += 1; @@ -107,18 +104,31 @@ impl DriverClosedLoop { Some(ApiReply::Reply { id: reply_id, result: cmd_result, - .. + redirect, }) => { if reply_id != req_id { logged_err!(self.id; "request ID mismatch: expected {}, replied {}", req_id, reply_id) } else { match cmd_result { - None => Ok(None), + None => { + if let Some(server) = redirect { + Ok(DriverReply::Redirect { server }) + } else { + Ok(DriverReply::Failure) + } + } + Some(CommandResult::Get { value }) => { - let lat = Instant::now().duration_since(issue_ts); - Ok(Some((req_id, value, lat))) + let latency = + Instant::now().duration_since(issue_ts); + Ok(DriverReply::Success { + req_id, + cmd_result: CommandResult::Get { value }, + latency, + }) } + _ => { logged_err!(self.id; "command type mismatch: expected Get") } @@ -126,23 +136,18 @@ impl DriverClosedLoop { } } - None => Ok(None), // timed-out + None => Ok(DriverReply::Timeout), _ => logged_err!(self.id; "unexpected reply type received"), } } - /// Send a Put request and wait for its reply. Returns: - /// - `Ok(Some((id, Some(old_value), latency)))` if successful and key exists - /// - `Ok(Some((id, None, latency)))` if successful and key did not exist - /// - `Ok(None)` if request unsuccessful, e.g., wrong leader or timeout - /// - `Err(err)` if any unexpected error occurs + /// Send a Put request and wait for its reply. pub async fn put( &mut self, key: &str, value: &str, - ) -> Result, Duration)>, SummersetError> - { + ) -> Result { let req_id = self.next_req; self.next_req += 1; @@ -160,18 +165,31 @@ impl DriverClosedLoop { Some(ApiReply::Reply { id: reply_id, result: cmd_result, - .. + redirect, }) => { if reply_id != req_id { logged_err!(self.id; "request ID mismatch: expected {}, replied {}", req_id, reply_id) } else { match cmd_result { - None => Ok(None), + None => { + if let Some(server) = redirect { + Ok(DriverReply::Redirect { server }) + } else { + Ok(DriverReply::Failure) + } + } + Some(CommandResult::Put { old_value }) => { - let lat = Instant::now().duration_since(issue_ts); - Ok(Some((req_id, old_value, lat))) + let latency = + Instant::now().duration_since(issue_ts); + Ok(DriverReply::Success { + req_id, + cmd_result: CommandResult::Put { old_value }, + latency, + }) } + _ => { logged_err!(self.id; "command type mismatch: expected Put") } @@ -179,7 +197,7 @@ impl DriverClosedLoop { } } - None => Ok(None), // timed-out + None => Ok(DriverReply::Timeout), _ => logged_err!(self.id; "unexpected reply type received"), } diff --git a/summerset_client/src/drivers/mod.rs b/summerset_client/src/drivers/mod.rs index 7ad5a9b3..3ed0bc44 100644 --- a/summerset_client/src/drivers/mod.rs +++ b/summerset_client/src/drivers/mod.rs @@ -1,7 +1,33 @@ //! Closed-loop & Open-loop client-side driver implementations. +use tokio::time::Duration; + +use summerset::{ReplicaId, RequestId, CommandResult}; + mod closed_loop; mod open_loop; pub use closed_loop::DriverClosedLoop; pub use open_loop::DriverOpenLoop; + +/// Reply result type, common across the two driver styles. +pub enum DriverReply { + /// Successful reply. + Success { + /// Request ID. + req_id: RequestId, + /// Command result. + cmd_result: CommandResult, + /// Latency duration. + latency: Duration, + }, + + /// Service indicated redirection. + Redirect { server: ReplicaId }, + + /// Unknown failure. + Failure, + + /// Client-side timer timeout. 
+ Timeout, +} diff --git a/summerset_client/src/drivers/open_loop.rs b/summerset_client/src/drivers/open_loop.rs index 057d414f..382091f1 100644 --- a/summerset_client/src/drivers/open_loop.rs +++ b/summerset_client/src/drivers/open_loop.rs @@ -7,12 +7,14 @@ use std::collections::HashMap; +use crate::drivers::DriverReply; + use tokio::time::{Duration, Instant}; use summerset::{ - GenericEndpoint, ClientId, Command, CommandResult, ApiRequest, ApiReply, - RequestId, ClientCtrlStub, Timer, SummersetError, pf_trace, pf_debug, - pf_error, logged_err, + GenericEndpoint, ClientId, Command, ApiRequest, ApiReply, RequestId, + ClientCtrlStub, Timer, SummersetError, pf_trace, pf_debug, pf_error, + logged_err, }; /// Open-loop driver struct. @@ -178,43 +180,44 @@ impl DriverOpenLoop { } } - /// Waits for the next reply. Returns the request ID and: - /// - `Ok(Some((id, cmd_result, latency)))` if request successful - /// - `Ok(None)` if request unsuccessful, e.g., wrong leader or timeout - /// - `Err(err)` if any unexpected error occurs - pub async fn wait_reply( - &mut self, - ) -> Result, SummersetError> - { + /// Waits for the next reply. + pub async fn wait_reply(&mut self) -> Result { let reply = self.recv_reply_with_timeout().await?; match reply { Some(ApiReply::Reply { id: reply_id, result: cmd_result, - .. + redirect, }) => { if !self.pending_reqs.contains_key(&reply_id) { logged_err!(self.id; "request ID {} not in pending set", reply_id) } else { let issue_ts = self.pending_reqs.remove(&reply_id).unwrap(); - let lat = Instant::now().duration_since(issue_ts); + let latency = Instant::now().duration_since(issue_ts); if let Some(res) = cmd_result { - Ok(Some((reply_id, res, lat))) + Ok(DriverReply::Success { + req_id: reply_id, + cmd_result: res, + latency, + }) + } else if let Some(server) = redirect { + Ok(DriverReply::Redirect { server }) } else { - Ok(None) + Ok(DriverReply::Failure) } } } - None => Ok(None), // timed-out + None => Ok(DriverReply::Timeout), _ => logged_err!(self.id; "unexpected reply type received"), } } /// Gets a mutable reference to the endpoint's control stub. + #[allow(dead_code)] pub fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { self.endpoint.ctrl_stub() } From 71abe89bdc0da3f24296f4af85433334be1724be Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 13 Sep 2023 06:08:11 -0500 Subject: [PATCH 41/89] better client driver side API --- summerset_client/src/clients/tester.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index 04b71a5b..4666f309 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -226,7 +226,7 @@ impl ClientTester { CtrlReply::QueryInfo { servers } => { Ok(servers.keys().copied().collect()) } - _ => logged_err!(self.driver.id; ""), + _ => logged_err!(self.driver.id; "unexpected control reply type"), } } From 8658d92be365c896999319eda57c7934489a0b4f Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 13 Sep 2023 11:19:48 -0500 Subject: [PATCH 42/89] add kill procs helper script --- scripts/kill_all_local_procs.sh | 14 ++++++++++++++ scripts/set_tcp_buf_sizes.sh | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) create mode 100755 scripts/kill_all_local_procs.sh diff --git a/scripts/kill_all_local_procs.sh b/scripts/kill_all_local_procs.sh new file mode 100755 index 00000000..a0bdc9db --- /dev/null +++ b/scripts/kill_all_local_procs.sh @@ -0,0 +1,14 @@ +#! 
/bin/bash + +kill_all_matching () { + for pid in $(sudo pgrep -f $1) + do + sudo kill -9 $pid + done +} + +kill_all_matching summerset_server +kill_all_matching summerset_client +kill_all_matching summerset_manager +kill_all_matching local_cluster.py +kill_all_matching local_client.py diff --git a/scripts/set_tcp_buf_sizes.sh b/scripts/set_tcp_buf_sizes.sh index 55d8d0a4..e95ca4e0 100755 --- a/scripts/set_tcp_buf_sizes.sh +++ b/scripts/set_tcp_buf_sizes.sh @@ -1,4 +1,4 @@ -#! /usr/bin/bash +#! /bin/bash echo "Per-socket TCP send/receive buffer:" echo "min default max" From b4db22d801ceae81c93d6abdc949314d1929686d Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 13 Sep 2023 11:43:46 -0500 Subject: [PATCH 43/89] minor updates to tester client --- summerset_client/src/clients/tester.rs | 33 ++++++++++++++++++-------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index 4666f309..607dbfb0 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -106,7 +106,7 @@ impl ClientTester { } /// Issues a Get request and checks its reply value against given one if - /// not `None`. Retries immediately upon getting redirection error. + /// not `None`. Retries in-place upon getting redirection error. async fn checked_get( &mut self, key: &str, @@ -142,7 +142,10 @@ impl ClientTester { ); } - DriverReply::Redirect { .. } => {} // re-issue immediately + DriverReply::Redirect { .. } => { + time::sleep(Duration::from_millis(500)).await; + // retry + } DriverReply::Timeout => { return logged_err!( @@ -156,7 +159,7 @@ impl ClientTester { } /// Issues a Put request and checks its reply old_value against given one - /// if not `None`. Retries immediately upon getting redirection error. + /// if not `None`. Retries in-place upon getting redirection error. async fn checked_put( &mut self, key: &str, @@ -194,7 +197,10 @@ impl ClientTester { ); } - DriverReply::Redirect { .. } => {} // re-issue immediately + DriverReply::Redirect { .. } => { + time::sleep(Duration::from_millis(500)).await; + // retry + } DriverReply::Timeout => { return logged_err!( @@ -359,7 +365,7 @@ impl ClientTester { for s in self.query_servers().await? 
{ self.driver.leave(false).await?; self.reset_servers(HashSet::from([s]), true).await?; - time::sleep(Duration::from_millis(100)).await; + time::sleep(Duration::from_millis(500)).await; self.driver.connect().await?; self.checked_get("Jose", Some(Some(&v))).await?; } @@ -370,11 +376,18 @@ impl ClientTester { async fn test_two_nodes_reset(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); self.checked_put("Jose", &v, Some(None)).await?; - self.driver.leave(false).await?; - self.reset_servers(HashSet::from([0, 1]), true).await?; - time::sleep(Duration::from_millis(100)).await; - self.driver.connect().await?; - self.checked_get("Jose", Some(Some(&v))).await?; + let servers = self.query_servers().await?; + for &s in &servers { + self.driver.leave(false).await?; + self.reset_servers( + HashSet::from([s, (s + 1) % (servers.len() as u8)]), + true, + ) + .await?; + time::sleep(Duration::from_millis(500)).await; + self.driver.connect().await?; + self.checked_get("Jose", Some(Some(&v))).await?; + } Ok(()) } } From 1a82a7f276c21001a85c837538b9f518331acc50 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 14 Sep 2023 14:50:45 -0500 Subject: [PATCH 44/89] add autonomous leader timeouts and step-up --- README.md | 4 +- src/client/apistub.rs | 1 - src/client/ctrlstub.rs | 1 - src/manager/reactor.rs | 4 +- src/manager/reigner.rs | 4 +- src/protocols/crossword.rs | 304 +++++++++++++++++----- src/protocols/multipaxos.rs | 122 ++++----- src/protocols/rep_nothing.rs | 51 ++-- src/protocols/rs_paxos.rs | 304 +++++++++++++++++----- src/protocols/simple_push.rs | 51 ++-- src/server/external.rs | 4 +- src/server/transport.rs | 4 +- summerset_client/src/drivers/open_loop.rs | 18 +- 13 files changed, 585 insertions(+), 287 deletions(-) diff --git a/README.md b/README.md index 16944f3a..0d008572 100644 --- a/README.md +++ b/README.md @@ -148,13 +148,15 @@ Complete cluster management and benchmarking scripts are available in another re - [x] implementation of MultiPaxos - [x] client-side timeout/retry logic - [x] state persistence & restart check - - [ ] automatic leader election, backoffs + - [x] automatic leader election, backoffs - [ ] snapshotting & garbage collection - [ ] specialize read-only commands? - [ ] separate commit vs. exec responses? 
- [ ] membership discovery & view changes - [ ] implementation of Raft - [ ] implementation of Crossword prototype + - [ ] fault recovery reads + - [ ] follower gossiping - [x] client-side utilities - [x] REPL-style client - [x] random benchmarking client diff --git a/src/client/apistub.rs b/src/client/apistub.rs index ea0bb14f..54fc1604 100644 --- a/src/client/apistub.rs +++ b/src/client/apistub.rs @@ -40,7 +40,6 @@ impl ClientApiStub { id: ClientId, addr: SocketAddr, ) -> Result { - pf_info!(id; "connecting to server '{}'...", addr); let mut stream = tcp_connect_with_retry(addr, 10).await?; stream.write_u64(id).await?; // send my client ID let (read_half, write_half) = stream.into_split(); diff --git a/src/client/ctrlstub.rs b/src/client/ctrlstub.rs index e1a28e75..f1e79481 100644 --- a/src/client/ctrlstub.rs +++ b/src/client/ctrlstub.rs @@ -38,7 +38,6 @@ impl ClientCtrlStub { pub async fn new_by_connect( manager: SocketAddr, ) -> Result { - pf_info!("c"; "connecting to manager '{}'...", manager); let mut stream = TcpStream::connect(manager).await?; let id = stream.read_u64().await?; // receive my client ID let (read_half, write_half) = stream.into_split(); diff --git a/src/manager/reactor.rs b/src/manager/reactor.rs index e1e388c3..1942f591 100644 --- a/src/manager/reactor.rs +++ b/src/manager/reactor.rs @@ -324,7 +324,7 @@ impl ClientReactor { mut rx_reply: mpsc::UnboundedReceiver, tx_exit: mpsc::UnboundedSender, ) { - pf_debug!("m"; "client_responder thread for {} ({}) spawned", id, addr); + pf_debug!("m"; "client_responder thread for {} '{}' spawned", id, addr); let (mut conn_read, conn_write) = conn.into_split(); let mut req_buf = BytesMut::with_capacity(8 + 1024); @@ -419,7 +419,7 @@ impl ClientReactor { if let Err(e) = tx_exit.send(id) { pf_error!("m"; "error sending exit signal for {}: {}", id, e); } - pf_debug!("m"; "client_responder thread for {} ({}) exitted", id, addr); + pf_debug!("m"; "client_responder thread for {} '{}' exitted", id, addr); } } diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index 02e4e4c3..b28b9262 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -333,7 +333,7 @@ impl ServerReigner { mut rx_send: mpsc::UnboundedReceiver, tx_exit: mpsc::UnboundedSender, ) { - pf_debug!("m"; "server_controller thread for {} ({}) spawned", id, addr); + pf_debug!("m"; "server_controller thread for {} '{}' spawned", id, addr); let (mut conn_read, conn_write) = conn.into_split(); let mut read_buf = BytesMut::new(); @@ -451,7 +451,7 @@ impl ServerReigner { if let Err(e) = tx_exit.send(id) { pf_error!("m"; "error sending exit signal for {}: {}", id, e); } - pf_debug!("m"; "server_controller thread for {} ({}) exitted", id, addr); + pf_debug!("m"; "server_controller thread for {} '{}' exitted", id, addr); } } diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 9ef9b2b5..1140a69c 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -7,7 +7,7 @@ use std::collections::HashMap; use std::path::Path; use std::net::SocketAddr; -use crate::utils::{SummersetError, Bitmap, RSCodeword}; +use crate::utils::{SummersetError, Bitmap, Timer, RSCodeword}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, @@ -17,13 +17,15 @@ use crate::server::{ use crate::client::{ClientId, ClientApiStub, ClientCtrlStub, GenericEndpoint}; use crate::protocols::SmrProtocol; +use rand::prelude::*; + use async_trait::async_trait; use 
get_size::GetSize; use serde::{Serialize, Deserialize}; -use tokio::time::Duration; +use tokio::time::{self, Duration, Interval, MissedTickBehavior}; use tokio::sync::watch; use reed_solomon_erasure::galois_8::ReedSolomon; @@ -43,6 +45,15 @@ pub struct ReplicaConfigCrossword { /// Whether to call `fsync()`/`fdatasync()` on logger. pub logger_sync: bool, + /// Min timeout of not hearing any heartbeat from leader in millisecs. + pub hb_hear_timeout_min: u64, + + /// Max timeout of not hearing any heartbeat from leader in millisecs. + pub hb_hear_timeout_max: u64, + + /// Interval of leader sending heartbeats to followers. + pub hb_send_interval_ms: u64, + /// Fault-tolerance level. pub fault_tolerance: u8, @@ -65,6 +76,9 @@ impl Default for ReplicaConfigCrossword { max_batch_size: 5000, backer_path: "/tmp/summerset.rs_paxos.wal".into(), logger_sync: false, + hb_hear_timeout_min: 300, + hb_hear_timeout_max: 600, + hb_send_interval_ms: 50, fault_tolerance: 0, shards_per_replica: 1, perf_storage_a: 0, @@ -131,6 +145,9 @@ struct Instance { /// Follower-side bookkeeping info. replica_bk: Option, + + /// True if from external client, else false. + external: bool, } /// Stable storage log entry type. @@ -177,6 +194,9 @@ enum PeerMsg { /// Commit notification from leader to replicas. Commit { slot: usize }, + + /// Leader activity heartbeat. + Heartbeat { ballot: Ballot }, } /// Crossword server replica module. @@ -214,6 +234,12 @@ pub struct CrosswordReplica { /// TransportHub module. transport_hub: TransportHub, + /// Timer for hearing heartbeat from leader. + hb_hear_timer: Timer, + + /// Interval for sending heartbeat to followers. + hb_send_interval: Interval, + /// Do I think I am the leader? is_leader: bool, @@ -385,7 +411,6 @@ impl CrosswordReplica { // create a new instance in the first null slot (or append a new one // at the end if no holes exist) - // TODO: maybe use a null_idx variable to better keep track of this let mut slot = self.insts.len(); for s in self.commit_bar..self.insts.len() { if self.insts[s].status == Status::Null { @@ -413,14 +438,12 @@ impl CrosswordReplica { accept_acks: HashMap::new(), }), replica_bk: None, + external: true, }; self.insts.push(new_inst); } // decide whether we can enter fast path for this instance - // TODO: remember to reset bal_prepared to 0, update bal_max_seen, - // and re-handle all Preparing & Accepting instances in autonomous - // Prepare initiation if self.bal_prepared == 0 { // slow case: Prepare phase not done yet. Initiate a Prepare round // if none is on the fly, or just wait for some Prepare reply to @@ -702,6 +725,7 @@ impl CrosswordReplica { )?, leader_bk: None, replica_bk: None, + external: false, }); } let inst = &mut self.insts[slot]; @@ -875,6 +899,7 @@ impl CrosswordReplica { )?, leader_bk: None, replica_bk: None, + external: false, }); } let inst = &mut self.insts[slot]; @@ -1002,6 +1027,7 @@ impl CrosswordReplica { )?, leader_bk: None, replica_bk: None, + external: false, }); } let inst = &mut self.insts[slot]; @@ -1054,6 +1080,7 @@ impl CrosswordReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), + PeerMsg::Heartbeat { ballot } => self.heard_heartbeat(peer, ballot), } } @@ -1075,7 +1102,7 @@ impl CrosswordReplica { // reply command result back to client if let ApiRequest::Req { id: req_id, .. 
} = req {
-                if self.external_api.has_client(client) {
+                if inst.external && self.external_api.has_client(client) {
                     self.external_api.send_reply(
                         ApiReply::Reply {
                             id: *req_id,
                             result: Some(cmd_result),
                             redirect: None,
                         },
                         client,
                     )?;
                     pf_trace!(self.id; "replied -> client {} for slot {} idx {}",
                               client, slot, cmd_idx);
                 }
             }
         }
         Ok(())
     }
+    /// Becomes a leader, sends self-initiated Prepare messages to followers
+    /// for all in-progress instances, and starts broadcasting heartbeats.
+    fn become_a_leader(&mut self) -> Result<(), SummersetError> {
+        assert!(!self.is_leader);
+        self.is_leader = true; // this starts broadcasting heartbeats
+        pf_warn!(self.id; "becoming a leader...");
+
+        // broadcast a heartbeat right now
+        self.bcast_heartbeats()?;
+
+        // make a greater ballot number and invalidate all in-progress instances
+        self.bal_prepared = 0;
+        self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen);
+        self.bal_max_seen = self.bal_prep_sent;
+
+        // redo Prepare phase for all in-progress instances
+        for (slot, inst) in self.insts.iter_mut().enumerate() {
+            if inst.status < Status::Committed {
+                inst.bal = self.bal_prep_sent;
+                inst.status = Status::Preparing;
+                pf_debug!(self.id; "enter Prepare phase for slot {} bal {}",
+                                   slot, inst.bal);
+
+                // record update to largest prepare ballot
+                self.storage_hub.submit_action(
+                    Self::make_log_action_id(slot, Status::Preparing),
+                    LogAction::Append {
+                        entry: LogEntry::PrepareBal {
+                            slot,
+                            ballot: self.bal_prep_sent,
+                        },
+                        sync: self.config.logger_sync,
+                    },
+                )?;
+                pf_trace!(self.id; "submitted PrepareBal log action for slot {} bal {}",
+                                   slot, inst.bal);
+
+                // send Prepare messages to all peers
+                self.transport_hub.bcast_msg(
+                    PeerMsg::Prepare {
+                        slot,
+                        ballot: self.bal_prep_sent,
+                    },
+                    None,
+                )?;
+                pf_trace!(self.id; "broadcast Prepare messages for slot {} bal {}",
+                                   slot, inst.bal);
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Broadcasts heartbeats to all replicas.
+    fn bcast_heartbeats(&mut self) -> Result<(), SummersetError> {
+        self.transport_hub.bcast_msg(
+            PeerMsg::Heartbeat {
+                ballot: self.bal_prep_sent,
+            },
+            None,
+        )?;
+        self.heard_heartbeat(self.id, self.bal_prep_sent)?;
+
+        // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent);
+        Ok(())
+    }
+
+    /// Chooses a random hb_hear_timeout from the min-max range and kicks off
+    /// the hb_hear_timer.
+    fn kickoff_hb_hear_timer(&mut self) -> Result<(), SummersetError> {
+        let timeout_ms = thread_rng().gen_range(
+            self.config.hb_hear_timeout_min..=self.config.hb_hear_timeout_max,
+        );
+
+        // pf_trace!(self.id; "kickoff hb_hear_timer @ {} ms", timeout_ms);
+        self.hb_hear_timer
+            .kickoff(Duration::from_millis(timeout_ms))?;
+        Ok(())
+    }
+
+    /// Heard a heartbeat from some other replica. If the heartbeat carries a
+    /// high enough ballot number, refreshes my hearing timer and clears my
+    /// leader status if I currently think I'm a leader.
+    fn heard_heartbeat(
+        &mut self,
+        _peer: ReplicaId,
+        ballot: Ballot,
+    ) -> Result<(), SummersetError> {
+        // ignore outdated heartbeat
+        if ballot < self.bal_max_seen {
+            return Ok(());
+        }
+
+        // reset hearing timer
+        self.kickoff_hb_hear_timer()?;
+
+        // clear my leader status if it carries a higher ballot number
+        if self.is_leader && ballot > self.bal_max_seen {
+            self.is_leader = false;
+        }
+
+        // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot);
+        Ok(())
+    }
+
     /// Handler of ResetState control message.
async fn handle_ctrl_reset_state( &mut self, @@ -1187,6 +1319,7 @@ impl CrosswordReplica { )?, leader_bk: None, replica_bk: None, + external: false, }); } // update instance state @@ -1219,6 +1352,7 @@ impl CrosswordReplica { )?, leader_bk: None, replica_bk: None, + external: false, }); } // update instance state @@ -1347,6 +1481,8 @@ impl GenericReplica for CrosswordReplica { let config = parsed_config!(config_str => ReplicaConfigCrossword; batch_interval_us, max_batch_size, backer_path, logger_sync, + hb_hear_timeout_min, hb_hear_timeout_max, + hb_send_interval_ms, fault_tolerance, shards_per_replica, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; @@ -1357,6 +1493,27 @@ impl GenericReplica for CrosswordReplica { config.batch_interval_us ); } + if config.hb_hear_timeout_min < 100 { + return logged_err!( + id; + "invalid config.hb_hear_timeout_min '{}'", + config.hb_hear_timeout_min + ); + } + if config.hb_hear_timeout_max < config.hb_hear_timeout_min + 100 { + return logged_err!( + id; + "invalid config.hb_hear_timeout_max '{}'", + config.hb_hear_timeout_max + ); + } + if config.hb_send_interval_ms == 0 { + return logged_err!( + id; + "invalid config.hb_send_interval_ms '{}'", + config.hb_send_interval_ms + ); + } // setup state machine module let state_machine = StateMachine::new_and_setup(id).await?; @@ -1437,6 +1594,10 @@ impl GenericReplica for CrosswordReplica { ) .await?; + let mut hb_send_interval = + time::interval(Duration::from_millis(config.hb_send_interval_ms)); + hb_send_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + Ok(CrosswordReplica { id, population, @@ -1449,6 +1610,8 @@ impl GenericReplica for CrosswordReplica { state_machine, storage_hub, transport_hub, + hb_hear_timer: Timer::new(), + hb_send_interval, is_leader: false, insts: vec![], bal_prep_sent: 0, @@ -1468,10 +1631,8 @@ impl GenericReplica for CrosswordReplica { // recover state from durable storage log self.recover_from_log().await?; - // TODO: proper leader election - if self.id == 0 { - self.is_leader = true; - } + // kick off leader activity hearing timer + self.kickoff_hb_hear_timer()?; // main event loop loop { @@ -1525,6 +1686,16 @@ impl GenericReplica for CrosswordReplica { } }, + // leader inactivity timeout + _ = self.hb_hear_timer.timeout() => { + self.become_a_leader()?; + }, + + // leader sending heartbeat + _ = self.hb_send_interval.tick(), if self.is_leader => { + self.bcast_heartbeats()?; + } + // manager control message ctrl_msg = self.control_hub.recv_ctrl() => { if let Err(e) = ctrl_msg { @@ -1584,17 +1755,17 @@ pub struct CrosswordClient { /// Configuration parameters struct. _config: ClientConfigCrossword, - /// Cached list of active servers information. + /// List of active servers information. servers: HashMap, - /// Current server ID to connect to. + /// Current server ID to talk to. server_id: ReplicaId, /// Control API stub to the cluster manager. ctrl_stub: ClientCtrlStub, /// API stubs for communicating with servers. 
- api_stub: Option, + api_stubs: HashMap, } #[async_trait] @@ -1604,6 +1775,7 @@ impl GenericEndpoint for CrosswordClient { config_str: Option<&str>, ) -> Result { // connect to the cluster manager and get assigned a client ID + pf_info!("c"; "connecting to manager '{}'...", manager); let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; let id = ctrl_stub.id; @@ -1618,13 +1790,13 @@ impl GenericEndpoint for CrosswordClient { servers: HashMap::new(), server_id: init_server_id, ctrl_stub, - api_stub: None, + api_stubs: HashMap::new(), }) } async fn connect(&mut self) -> Result<(), SummersetError> { // disallow reconnection without leaving - if self.api_stub.is_some() { + if !self.api_stubs.is_empty() { return logged_err!(self.id; "reconnecting without leaving"); } @@ -1638,13 +1810,13 @@ impl GenericEndpoint for CrosswordClient { let reply = self.ctrl_stub.recv_reply().await?; match reply { CtrlReply::QueryInfo { servers } => { - // connect to the one with server ID in config - let api_stub = ClientApiStub::new_by_connect( - self.id, - servers[&self.server_id], - ) - .await?; - self.api_stub = Some(api_stub); + // establish connection to all servers + for (&id, &server) in &servers { + pf_info!(self.id; "connecting to server {} '{}'...", id, server); + let api_stub = + ClientApiStub::new_by_connect(self.id, server).await?; + self.api_stubs.insert(id, api_stub); + } self.servers = servers; Ok(()) } @@ -1653,23 +1825,16 @@ impl GenericEndpoint for CrosswordClient { } async fn leave(&mut self, permanent: bool) -> Result<(), SummersetError> { - // send leave notification to current connected server - if let Some(mut api_stub) = self.api_stub.take() { + // send leave notification to all servers + for (id, mut api_stub) in self.api_stubs.drain() { let mut sent = api_stub.send_req(Some(&ApiRequest::Leave))?; while !sent { sent = api_stub.send_req(None)?; } - let reply = api_stub.recv_reply().await?; - match reply { - ApiReply::Leave => { - pf_info!(self.id; "left current server connection"); - api_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } - } + while api_stub.recv_reply().await? != ApiReply::Leave {} + pf_info!(self.id; "left server connection {}", id); + api_stub.forget(); } // if permanently leaving, send leave notification to the manager @@ -1680,15 +1845,8 @@ impl GenericEndpoint for CrosswordClient { sent = self.ctrl_stub.send_req(None)?; } - let reply = self.ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } - } + while self.ctrl_stub.recv_reply().await? != CtrlReply::Leave {} + pf_info!(self.id; "left current manager connection"); } Ok(()) @@ -1698,38 +1856,44 @@ impl GenericEndpoint for CrosswordClient { &mut self, req: Option<&ApiRequest>, ) -> Result { - match self.api_stub { - Some(ref mut api_stub) => api_stub.send_req(req), - None => logged_err!(self.id; "client is not set up"), + if self.api_stubs.contains_key(&self.server_id) { + self.api_stubs + .get_mut(&self.server_id) + .unwrap() + .send_req(req) + } else { + Err(SummersetError("client not set up".into())) } } async fn recv_reply(&mut self) -> Result { - match self.api_stub { - Some(ref mut api_stub) => { - let reply = api_stub.recv_reply().await?; - - if let ApiReply::Reply { - ref result, - ref redirect, - .. 
- } = reply - { - // if the current server redirects me to a different server - if result.is_none() && redirect.is_some() { - let redirect_id = redirect.unwrap(); - assert!(self.servers.contains_key(&redirect_id)); - self.leave(false).await?; - self.server_id = redirect_id; - self.connect().await?; - pf_debug!(self.id; "redirected to replica {} '{}'", - redirect_id, self.servers[&redirect_id]); - } - } + if self.api_stubs.contains_key(&self.server_id) { + let reply = self + .api_stubs + .get_mut(&self.server_id) + .unwrap() + .recv_reply() + .await?; - Ok(reply) + if let ApiReply::Reply { + ref result, + ref redirect, + .. + } = reply + { + // if the current server redirects me to a different server + if result.is_none() && redirect.is_some() { + let redirect_id = redirect.unwrap(); + assert!(self.servers.contains_key(&redirect_id)); + self.server_id = redirect_id; + pf_debug!(self.id; "redirected to replica {} '{}'", + redirect_id, self.servers[&redirect_id]); + } } - None => logged_err!(self.id; "client is not set up"), + + Ok(reply) + } else { + Err(SummersetError("client not set up".into())) } } diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 4690c08d..130a2f45 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -137,6 +137,9 @@ struct Instance { /// Follower-side bookkeeping info. replica_bk: Option, + + /// True if from external client, else false. + external: bool, } /// Stable storage log entry type. @@ -362,6 +365,7 @@ impl MultiPaxosReplica { accept_acks: Bitmap::new(self.population, false), }), replica_bk: None, + external: true, }; self.insts.push(new_inst); } @@ -607,6 +611,7 @@ impl MultiPaxosReplica { reqs: Vec::new(), leader_bk: None, replica_bk: None, + external: false, }); } let inst = &mut self.insts[slot]; @@ -736,6 +741,7 @@ impl MultiPaxosReplica { reqs: Vec::new(), leader_bk: None, replica_bk: None, + external: false, }); } let inst = &mut self.insts[slot]; @@ -839,6 +845,7 @@ impl MultiPaxosReplica { reqs: Vec::new(), leader_bk: None, replica_bk: None, + external: false, }); } let inst = &mut self.insts[slot]; @@ -910,7 +917,7 @@ impl MultiPaxosReplica { // reply command result back to client if let ApiRequest::Req { id: req_id, .. } = req { - if self.external_api.has_client(client) { + if inst.external && self.external_api.has_client(client) { self.external_api.send_reply( ApiReply::Reply { id: *req_id, @@ -1124,6 +1131,7 @@ impl MultiPaxosReplica { reqs: Vec::new(), leader_bk: None, replica_bk: None, + external: false, }); } // update instance state @@ -1149,6 +1157,7 @@ impl MultiPaxosReplica { reqs: Vec::new(), leader_bk: None, replica_bk: None, + external: false, }); } // update instance state @@ -1520,17 +1529,17 @@ pub struct MultiPaxosClient { /// Configuration parameters struct. _config: ClientConfigMultiPaxos, - /// Cached list of active servers information. + /// List of active servers information. servers: HashMap, - /// Current server ID to connect to. + /// Current server ID to talk to. server_id: ReplicaId, /// Control API stub to the cluster manager. ctrl_stub: ClientCtrlStub, /// API stubs for communicating with servers. 
- api_stub: Option, + api_stubs: HashMap, } #[async_trait] @@ -1540,6 +1549,7 @@ impl GenericEndpoint for MultiPaxosClient { config_str: Option<&str>, ) -> Result { // connect to the cluster manager and get assigned a client ID + pf_info!("c"; "connecting to manager '{}'...", manager); let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; let id = ctrl_stub.id; @@ -1554,13 +1564,13 @@ impl GenericEndpoint for MultiPaxosClient { servers: HashMap::new(), server_id: init_server_id, ctrl_stub, - api_stub: None, + api_stubs: HashMap::new(), }) } async fn connect(&mut self) -> Result<(), SummersetError> { // disallow reconnection without leaving - if self.api_stub.is_some() { + if !self.api_stubs.is_empty() { return logged_err!(self.id; "reconnecting without leaving"); } @@ -1574,13 +1584,13 @@ impl GenericEndpoint for MultiPaxosClient { let reply = self.ctrl_stub.recv_reply().await?; match reply { CtrlReply::QueryInfo { servers } => { - // connect to the one with server ID in config - let api_stub = ClientApiStub::new_by_connect( - self.id, - servers[&self.server_id], - ) - .await?; - self.api_stub = Some(api_stub); + // establish connection to all servers + for (&id, &server) in &servers { + pf_info!(self.id; "connecting to server {} '{}'...", id, server); + let api_stub = + ClientApiStub::new_by_connect(self.id, server).await?; + self.api_stubs.insert(id, api_stub); + } self.servers = servers; Ok(()) } @@ -1589,23 +1599,16 @@ impl GenericEndpoint for MultiPaxosClient { } async fn leave(&mut self, permanent: bool) -> Result<(), SummersetError> { - // send leave notification to current connected server - if let Some(mut api_stub) = self.api_stub.take() { + // send leave notification to all servers + for (id, mut api_stub) in self.api_stubs.drain() { let mut sent = api_stub.send_req(Some(&ApiRequest::Leave))?; while !sent { sent = api_stub.send_req(None)?; } - let reply = api_stub.recv_reply().await?; - match reply { - ApiReply::Leave => { - pf_info!(self.id; "left current server connection"); - api_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } - } + while api_stub.recv_reply().await? != ApiReply::Leave {} + pf_info!(self.id; "left server connection {}", id); + api_stub.forget(); } // if permanently leaving, send leave notification to the manager @@ -1616,15 +1619,8 @@ impl GenericEndpoint for MultiPaxosClient { sent = self.ctrl_stub.send_req(None)?; } - let reply = self.ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } - } + while self.ctrl_stub.recv_reply().await? != CtrlReply::Leave {} + pf_info!(self.id; "left manager connection"); } Ok(()) @@ -1634,38 +1630,44 @@ impl GenericEndpoint for MultiPaxosClient { &mut self, req: Option<&ApiRequest>, ) -> Result { - match self.api_stub { - Some(ref mut api_stub) => api_stub.send_req(req), - None => logged_err!(self.id; "client is not set up"), + if self.api_stubs.contains_key(&self.server_id) { + self.api_stubs + .get_mut(&self.server_id) + .unwrap() + .send_req(req) + } else { + Err(SummersetError("client not set up".into())) } } async fn recv_reply(&mut self) -> Result { - match self.api_stub { - Some(ref mut api_stub) => { - let reply = api_stub.recv_reply().await?; - - if let ApiReply::Reply { - ref result, - ref redirect, - .. 
- } = reply - { - // if the current server redirects me to a different server - if result.is_none() && redirect.is_some() { - let redirect_id = redirect.unwrap(); - assert!(self.servers.contains_key(&redirect_id)); - self.leave(false).await?; - self.server_id = redirect_id; - self.connect().await?; - pf_debug!(self.id; "redirected to replica {} '{}'", - redirect_id, self.servers[&redirect_id]); - } - } + if self.api_stubs.contains_key(&self.server_id) { + let reply = self + .api_stubs + .get_mut(&self.server_id) + .unwrap() + .recv_reply() + .await?; - Ok(reply) + if let ApiReply::Reply { + ref result, + ref redirect, + .. + } = reply + { + // if the current server redirects me to a different server + if result.is_none() && redirect.is_some() { + let redirect_id = redirect.unwrap(); + assert!(self.servers.contains_key(&redirect_id)); + self.server_id = redirect_id; + pf_debug!(self.id; "redirected to replica {} '{}'", + redirect_id, self.servers[&redirect_id]); + } } - None => logged_err!(self.id; "client is not set up"), + + Ok(reply) + } else { + Err(SummersetError("client not set up".into())) } } diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index 6475de8d..643cdf7a 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -218,14 +218,16 @@ impl RepNothingReplica { let (client, req) = &inst.reqs[cmd_idx]; match req { ApiRequest::Req { id: req_id, .. } => { - self.external_api.send_reply( - ApiReply::Reply { - id: *req_id, - result: Some(cmd_result), - redirect: None, - }, - *client, - )?; + if self.external_api.has_client(*client) { + self.external_api.send_reply( + ApiReply::Reply { + id: *req_id, + result: Some(cmd_result), + redirect: None, + }, + *client, + )?; + } } _ => { return logged_err!(self.id; "unknown request type at {}|{}", inst_idx, cmd_idx) @@ -535,7 +537,7 @@ pub struct RepNothingClient { /// Control API stub to the cluster manager. ctrl_stub: ClientCtrlStub, - /// API stubs for communicating with servers. + /// API stub for communicating with the current server. api_stub: Option, } @@ -546,6 +548,7 @@ impl GenericEndpoint for RepNothingClient { config_str: Option<&str>, ) -> Result { // connect to the cluster manager and get assigned a client ID + pf_info!("c"; "connecting to manager '{}'...", manager); let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; let id = ctrl_stub.id; @@ -578,6 +581,8 @@ impl GenericEndpoint for RepNothingClient { match reply { CtrlReply::QueryInfo { servers } => { // connect to the one with server ID in config + pf_info!(self.id; "connecting to server {} '{}'...", + self.config.server_id, servers[&self.config.server_id]); let api_stub = ClientApiStub::new_by_connect( self.id, servers[&self.config.server_id], @@ -598,16 +603,9 @@ impl GenericEndpoint for RepNothingClient { sent = api_stub.send_req(None)?; } - let reply = api_stub.recv_reply().await?; - match reply { - ApiReply::Leave => { - pf_info!(self.id; "left current server connection"); - api_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } - } + while api_stub.recv_reply().await? 
!= ApiReply::Leave {} + pf_info!(self.id; "left current server connection"); + api_stub.forget(); } // if permanently leaving, send leave notification to the manager @@ -618,15 +616,8 @@ impl GenericEndpoint for RepNothingClient { sent = self.ctrl_stub.send_req(None)?; } - let reply = self.ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } - } + while self.ctrl_stub.recv_reply().await? != CtrlReply::Leave {} + pf_info!(self.id; "left manager connection"); } Ok(()) @@ -638,14 +629,14 @@ impl GenericEndpoint for RepNothingClient { ) -> Result { match self.api_stub { Some(ref mut api_stub) => api_stub.send_req(req), - None => logged_err!(self.id; "client is not set up"), + None => Err(SummersetError("client not set up".into())), } } async fn recv_reply(&mut self) -> Result { match self.api_stub { Some(ref mut api_stub) => api_stub.recv_reply().await, - None => logged_err!(self.id; "client is not set up"), + None => Err(SummersetError("client not set up".into())), } } diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index ed66f6ea..172c9771 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -7,7 +7,7 @@ use std::collections::HashMap; use std::path::Path; use std::net::SocketAddr; -use crate::utils::{SummersetError, Bitmap, RSCodeword}; +use crate::utils::{SummersetError, Bitmap, Timer, RSCodeword}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, @@ -17,13 +17,15 @@ use crate::server::{ use crate::client::{ClientId, ClientApiStub, ClientCtrlStub, GenericEndpoint}; use crate::protocols::SmrProtocol; +use rand::prelude::*; + use async_trait::async_trait; use get_size::GetSize; use serde::{Serialize, Deserialize}; -use tokio::time::Duration; +use tokio::time::{self, Duration, Interval, MissedTickBehavior}; use tokio::sync::watch; use reed_solomon_erasure::galois_8::ReedSolomon; @@ -43,6 +45,15 @@ pub struct ReplicaConfigRSPaxos { /// Whether to call `fsync()`/`fdatasync()` on logger. pub logger_sync: bool, + /// Min timeout of not hearing any heartbeat from leader in millisecs. + pub hb_hear_timeout_min: u64, + + /// Max timeout of not hearing any heartbeat from leader in millisecs. + pub hb_hear_timeout_max: u64, + + /// Interval of leader sending heartbeats to followers. + pub hb_send_interval_ms: u64, + /// Fault-tolerance level. pub fault_tolerance: u8, @@ -61,6 +72,9 @@ impl Default for ReplicaConfigRSPaxos { max_batch_size: 5000, backer_path: "/tmp/summerset.rs_paxos.wal".into(), logger_sync: false, + hb_hear_timeout_min: 300, + hb_hear_timeout_max: 600, + hb_send_interval_ms: 50, fault_tolerance: 0, perf_storage_a: 0, perf_storage_b: 0, @@ -125,6 +139,9 @@ struct Instance { /// Follower-side bookkeeping info. replica_bk: Option, + + /// True if from external client, else false. + external: bool, } /// Stable storage log entry type. @@ -171,6 +188,9 @@ enum PeerMsg { /// Commit notification from leader to replicas. Commit { slot: usize }, + + /// Leader activity heartbeat. + Heartbeat { ballot: Ballot }, } /// RSPaxos server replica module. @@ -208,6 +228,12 @@ pub struct RSPaxosReplica { /// TransportHub module. transport_hub: TransportHub, + /// Timer for hearing heartbeat from leader. + hb_hear_timer: Timer, + + /// Interval for sending heartbeat to followers. 
+ hb_send_interval: Interval, + /// Do I think I am the leader? is_leader: bool, @@ -327,7 +353,6 @@ impl RSPaxosReplica { // create a new instance in the first null slot (or append a new one // at the end if no holes exist) - // TODO: maybe use a null_idx variable to better keep track of this let mut slot = self.insts.len(); for s in self.commit_bar..self.insts.len() { if self.insts[s].status == Status::Null { @@ -355,14 +380,12 @@ impl RSPaxosReplica { accept_acks: Bitmap::new(self.population, false), }), replica_bk: None, + external: true, }; self.insts.push(new_inst); } // decide whether we can enter fast path for this instance - // TODO: remember to reset bal_prepared to 0, update bal_max_seen, - // and re-handle all Preparing & Accepting instances in autonomous - // Prepare initiation if self.bal_prepared == 0 { // slow case: Prepare phase not done yet. Initiate a Prepare round // if none is on the fly, or just wait for some Prepare reply to @@ -629,6 +652,7 @@ impl RSPaxosReplica { )?, leader_bk: None, replica_bk: None, + external: false, }); } let inst = &mut self.insts[slot]; @@ -788,6 +812,7 @@ impl RSPaxosReplica { )?, leader_bk: None, replica_bk: None, + external: false, }); } let inst = &mut self.insts[slot]; @@ -902,6 +927,7 @@ impl RSPaxosReplica { )?, leader_bk: None, replica_bk: None, + external: false, }); } let inst = &mut self.insts[slot]; @@ -954,6 +980,7 @@ impl RSPaxosReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), + PeerMsg::Heartbeat { ballot } => self.heard_heartbeat(peer, ballot), } } @@ -975,7 +1002,7 @@ impl RSPaxosReplica { // reply command result back to client if let ApiRequest::Req { id: req_id, .. } = req { - if self.external_api.has_client(client) { + if inst.external && self.external_api.has_client(client) { self.external_api.send_reply( ApiReply::Reply { id: *req_id, @@ -1013,6 +1040,111 @@ impl RSPaxosReplica { Ok(()) } + /// Becomes a leader, sends self-initiated Prepare messages to followers + /// for all in-progress instances, and starts broadcasting heartbeats. 
+ fn become_a_leader(&mut self) -> Result<(), SummersetError> {
+        assert!(!self.is_leader);
+        self.is_leader = true; // this starts broadcasting heartbeats
+        pf_warn!(self.id; "becoming a leader...");
+
+        // broadcast a heartbeat right now
+        self.bcast_heartbeats()?;
+
+        // make a greater ballot number and invalidate all in-progress instances
+        self.bal_prepared = 0;
+        self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen);
+        self.bal_max_seen = self.bal_prep_sent;
+
+        // redo Prepare phase for all in-progress instances
+        for (slot, inst) in self.insts.iter_mut().enumerate() {
+            if inst.status < Status::Committed {
+                inst.bal = self.bal_prep_sent;
+                inst.status = Status::Preparing;
+                pf_debug!(self.id; "enter Prepare phase for slot {} bal {}",
+                                   slot, inst.bal);
+
+                // record update to largest prepare ballot
+                self.storage_hub.submit_action(
+                    Self::make_log_action_id(slot, Status::Preparing),
+                    LogAction::Append {
+                        entry: LogEntry::PrepareBal {
+                            slot,
+                            ballot: self.bal_prep_sent,
+                        },
+                        sync: self.config.logger_sync,
+                    },
+                )?;
+                pf_trace!(self.id; "submitted PrepareBal log action for slot {} bal {}",
+                                   slot, inst.bal);
+
+                // send Prepare messages to all peers
+                self.transport_hub.bcast_msg(
+                    PeerMsg::Prepare {
+                        slot,
+                        ballot: self.bal_prep_sent,
+                    },
+                    None,
+                )?;
+                pf_trace!(self.id; "broadcast Prepare messages for slot {} bal {}",
+                                   slot, inst.bal);
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Broadcasts heartbeats to all replicas.
+    fn bcast_heartbeats(&mut self) -> Result<(), SummersetError> {
+        self.transport_hub.bcast_msg(
+            PeerMsg::Heartbeat {
+                ballot: self.bal_prep_sent,
+            },
+            None,
+        )?;
+        self.heard_heartbeat(self.id, self.bal_prep_sent)?;
+
+        // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent);
+        Ok(())
+    }
+
+    /// Chooses a random hb_hear_timeout from the min-max range and kicks off
+    /// the hb_hear_timer.
+    fn kickoff_hb_hear_timer(&mut self) -> Result<(), SummersetError> {
+        let timeout_ms = thread_rng().gen_range(
+            self.config.hb_hear_timeout_min..=self.config.hb_hear_timeout_max,
+        );
+
+        // pf_trace!(self.id; "kickoff hb_hear_timer @ {} ms", timeout_ms);
+        self.hb_hear_timer
+            .kickoff(Duration::from_millis(timeout_ms))?;
+        Ok(())
+    }
+
+    /// Heard a heartbeat from some other replica. If the heartbeat carries a
+    /// high enough ballot number, refreshes my hearing timer and clears my
+    /// leader status if I currently think I'm a leader.
+    fn heard_heartbeat(
+        &mut self,
+        _peer: ReplicaId,
+        ballot: Ballot,
+    ) -> Result<(), SummersetError> {
+        // ignore outdated heartbeat
+        if ballot < self.bal_max_seen {
+            return Ok(());
+        }
+
+        // reset hearing timer
+        self.kickoff_hb_hear_timer()?;
+
+        // clear my leader status if it carries a higher ballot number
+        if self.is_leader && ballot > self.bal_max_seen {
+            self.is_leader = false;
+        }
+
+        // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot);
+        Ok(())
+    }
+
     /// Handler of ResetState control message.
async fn handle_ctrl_reset_state( &mut self, @@ -1087,6 +1219,7 @@ impl RSPaxosReplica { )?, leader_bk: None, replica_bk: None, + external: false, }); } // update instance state @@ -1119,6 +1252,7 @@ impl RSPaxosReplica { )?, leader_bk: None, replica_bk: None, + external: false, }); } // update instance state @@ -1246,7 +1380,9 @@ impl GenericReplica for RSPaxosReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigRSPaxos; batch_interval_us, max_batch_size, - backer_path, logger_sync, fault_tolerance, + backer_path, logger_sync, + hb_hear_timeout_min, hb_hear_timeout_max, + hb_send_interval_ms, fault_tolerance, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; if config.batch_interval_us == 0 { @@ -1256,6 +1392,27 @@ impl GenericReplica for RSPaxosReplica { config.batch_interval_us ); } + if config.hb_hear_timeout_min < 100 { + return logged_err!( + id; + "invalid config.hb_hear_timeout_min '{}'", + config.hb_hear_timeout_min + ); + } + if config.hb_hear_timeout_max < config.hb_hear_timeout_min + 100 { + return logged_err!( + id; + "invalid config.hb_hear_timeout_max '{}'", + config.hb_hear_timeout_max + ); + } + if config.hb_send_interval_ms == 0 { + return logged_err!( + id; + "invalid config.hb_send_interval_ms '{}'", + config.hb_send_interval_ms + ); + } // setup state machine module let state_machine = StateMachine::new_and_setup(id).await?; @@ -1330,6 +1487,10 @@ impl GenericReplica for RSPaxosReplica { ) .await?; + let mut hb_send_interval = + time::interval(Duration::from_millis(config.hb_send_interval_ms)); + hb_send_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + Ok(RSPaxosReplica { id, population, @@ -1342,6 +1503,8 @@ impl GenericReplica for RSPaxosReplica { state_machine, storage_hub, transport_hub, + hb_hear_timer: Timer::new(), + hb_send_interval, is_leader: false, insts: vec![], bal_prep_sent: 0, @@ -1361,10 +1524,8 @@ impl GenericReplica for RSPaxosReplica { // recover state from durable storage log self.recover_from_log().await?; - // TODO: proper leader election - if self.id == 0 { - self.is_leader = true; - } + // kick off leader activity hearing timer + self.kickoff_hb_hear_timer()?; // main event loop loop { @@ -1418,6 +1579,16 @@ impl GenericReplica for RSPaxosReplica { } }, + // leader inactivity timeout + _ = self.hb_hear_timer.timeout() => { + self.become_a_leader()?; + }, + + // leader sending heartbeat + _ = self.hb_send_interval.tick(), if self.is_leader => { + self.bcast_heartbeats()?; + } + // manager control message ctrl_msg = self.control_hub.recv_ctrl() => { if let Err(e) = ctrl_msg { @@ -1477,17 +1648,17 @@ pub struct RSPaxosClient { /// Configuration parameters struct. _config: ClientConfigRSPaxos, - /// Cached list of active servers information. + /// List of active servers information. servers: HashMap, - /// Current server ID to connect to. + /// Current server ID to talk to. server_id: ReplicaId, /// Control API stub to the cluster manager. ctrl_stub: ClientCtrlStub, /// API stubs for communicating with servers. 
- api_stub: Option, + api_stubs: HashMap, } #[async_trait] @@ -1497,6 +1668,7 @@ impl GenericEndpoint for RSPaxosClient { config_str: Option<&str>, ) -> Result { // connect to the cluster manager and get assigned a client ID + pf_info!("c"; "connecting to manager '{}'...", manager); let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; let id = ctrl_stub.id; @@ -1511,13 +1683,13 @@ impl GenericEndpoint for RSPaxosClient { servers: HashMap::new(), server_id: init_server_id, ctrl_stub, - api_stub: None, + api_stubs: HashMap::new(), }) } async fn connect(&mut self) -> Result<(), SummersetError> { // disallow reconnection without leaving - if self.api_stub.is_some() { + if !self.api_stubs.is_empty() { return logged_err!(self.id; "reconnecting without leaving"); } @@ -1531,13 +1703,13 @@ impl GenericEndpoint for RSPaxosClient { let reply = self.ctrl_stub.recv_reply().await?; match reply { CtrlReply::QueryInfo { servers } => { - // connect to the one with server ID in config - let api_stub = ClientApiStub::new_by_connect( - self.id, - servers[&self.server_id], - ) - .await?; - self.api_stub = Some(api_stub); + // establish connection to all servers + for (&id, &server) in &servers { + pf_info!(self.id; "connecting to server {} '{}'...", id, server); + let api_stub = + ClientApiStub::new_by_connect(self.id, server).await?; + self.api_stubs.insert(id, api_stub); + } self.servers = servers; Ok(()) } @@ -1547,22 +1719,15 @@ impl GenericEndpoint for RSPaxosClient { async fn leave(&mut self, permanent: bool) -> Result<(), SummersetError> { // send leave notification to current connected server - if let Some(mut api_stub) = self.api_stub.take() { + for (id, mut api_stub) in self.api_stubs.drain() { let mut sent = api_stub.send_req(Some(&ApiRequest::Leave))?; while !sent { sent = api_stub.send_req(None)?; } - let reply = api_stub.recv_reply().await?; - match reply { - ApiReply::Leave => { - pf_info!(self.id; "left current server connection"); - api_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } - } + while api_stub.recv_reply().await? != ApiReply::Leave {} + pf_info!(self.id; "left server connection {}", id); + api_stub.forget(); } // if permanently leaving, send leave notification to the manager @@ -1573,15 +1738,8 @@ impl GenericEndpoint for RSPaxosClient { sent = self.ctrl_stub.send_req(None)?; } - let reply = self.ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } - } + while self.ctrl_stub.recv_reply().await? != CtrlReply::Leave {} + pf_info!(self.id; "left current manager connection"); } Ok(()) @@ -1591,38 +1749,44 @@ impl GenericEndpoint for RSPaxosClient { &mut self, req: Option<&ApiRequest>, ) -> Result { - match self.api_stub { - Some(ref mut api_stub) => api_stub.send_req(req), - None => logged_err!(self.id; "client is not set up"), + if self.api_stubs.contains_key(&self.server_id) { + self.api_stubs + .get_mut(&self.server_id) + .unwrap() + .send_req(req) + } else { + Err(SummersetError("client not set up".into())) } } async fn recv_reply(&mut self) -> Result { - match self.api_stub { - Some(ref mut api_stub) => { - let reply = api_stub.recv_reply().await?; - - if let ApiReply::Reply { - ref result, - ref redirect, - .. 
- } = reply - { - // if the current server redirects me to a different server - if result.is_none() && redirect.is_some() { - let redirect_id = redirect.unwrap(); - assert!(self.servers.contains_key(&redirect_id)); - self.leave(false).await?; - self.server_id = redirect_id; - self.connect().await?; - pf_debug!(self.id; "redirected to replica {} '{}'", - redirect_id, self.servers[&redirect_id]); - } - } + if self.api_stubs.contains_key(&self.server_id) { + let reply = self + .api_stubs + .get_mut(&self.server_id) + .unwrap() + .recv_reply() + .await?; - Ok(reply) + if let ApiReply::Reply { + ref result, + ref redirect, + .. + } = reply + { + // if the current server redirects me to a different server + if result.is_none() && redirect.is_some() { + let redirect_id = redirect.unwrap(); + assert!(self.servers.contains_key(&redirect_id)); + self.server_id = redirect_id; + pf_debug!(self.id; "redirected to replica {} '{}'", + redirect_id, self.servers[&redirect_id]); + } } - None => logged_err!(self.id; "client is not set up"), + + Ok(reply) + } else { + Err(SummersetError("client not set up".into())) } } diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index 7260bc27..593cbd27 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -379,14 +379,16 @@ impl SimplePushReplica { let (client, req) = &inst.reqs[cmd_idx]; match req { ApiRequest::Req { id: req_id, .. } => { - self.external_api.send_reply( - ApiReply::Reply { - id: *req_id, - result: Some(cmd_result), - redirect: None, - }, - *client, - )?; + if self.external_api.has_client(*client) { + self.external_api.send_reply( + ApiReply::Reply { + id: *req_id, + result: Some(cmd_result), + redirect: None, + }, + *client, + )?; + } } _ => { return logged_err!(self.id; "unknown request type at {}|{}", inst_idx, cmd_idx) @@ -762,7 +764,7 @@ pub struct SimplePushClient { /// Control API stub to the cluster manager. ctrl_stub: ClientCtrlStub, - /// API stubs for communicating with servers. + /// API stub for communicating with the current server. api_stub: Option, } @@ -773,6 +775,7 @@ impl GenericEndpoint for SimplePushClient { config_str: Option<&str>, ) -> Result { // connect to the cluster manager and get assigned a client ID + pf_info!("c"; "connecting to manager '{}'...", manager); let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; let id = ctrl_stub.id; @@ -805,6 +808,8 @@ impl GenericEndpoint for SimplePushClient { match reply { CtrlReply::QueryInfo { servers } => { // connect to the one with server ID in config + pf_info!(self.id; "connecting to server {} '{}'...", + self.config.server_id, servers[&self.config.server_id]); let api_stub = ClientApiStub::new_by_connect( self.id, servers[&self.config.server_id], @@ -825,16 +830,9 @@ impl GenericEndpoint for SimplePushClient { sent = api_stub.send_req(None)?; } - let reply = api_stub.recv_reply().await?; - match reply { - ApiReply::Leave => { - pf_info!(self.id; "left current server connection"); - api_stub.forget(); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } - } + while api_stub.recv_reply().await? 
!= ApiReply::Leave {} + pf_info!(self.id; "left current server connection"); + api_stub.forget(); } // if permanently leaving, send leave notification to the manager @@ -845,15 +843,8 @@ impl GenericEndpoint for SimplePushClient { sent = self.ctrl_stub.send_req(None)?; } - let reply = self.ctrl_stub.recv_reply().await?; - match reply { - CtrlReply::Leave => { - pf_info!(self.id; "left current manager connection"); - } - _ => { - return logged_err!(self.id; "unexpected reply type received"); - } - } + while self.ctrl_stub.recv_reply().await? != CtrlReply::Leave {} + pf_info!(self.id; "left manager connection"); } Ok(()) @@ -865,14 +856,14 @@ impl GenericEndpoint for SimplePushClient { ) -> Result { match self.api_stub { Some(ref mut api_stub) => api_stub.send_req(req), - None => logged_err!(self.id; "client is not set up"), + None => Err(SummersetError("client not set up".into())), } } async fn recv_reply(&mut self) -> Result { match self.api_stub { Some(ref mut api_stub) => api_stub.recv_reply().await, - None => logged_err!(self.id; "client is not set up"), + None => Err(SummersetError("client not set up".into())), } } diff --git a/src/server/external.rs b/src/server/external.rs index 769c1cd7..cc820c00 100644 --- a/src/server/external.rs +++ b/src/server/external.rs @@ -378,7 +378,7 @@ impl ExternalApi { mut rx_reply: mpsc::UnboundedReceiver, tx_exit: mpsc::UnboundedSender, ) { - pf_debug!(me; "client_servant thread for {} ({}) spawned", id, addr); + pf_debug!(me; "client_servant thread for {} '{}' spawned", id, addr); let (mut conn_read, conn_write) = conn.into_split(); let mut req_buf = BytesMut::with_capacity(8 + 1024); @@ -477,7 +477,7 @@ impl ExternalApi { if let Err(e) = tx_exit.send(id) { pf_error!(me; "error sending exit signal for {}: {}", id, e); } - pf_debug!(me; "client_servant thread for {} ({}) exitted", id, addr); + pf_debug!(me; "client_servant thread for {} '{}' exitted", id, addr); } } diff --git a/src/server/transport.rs b/src/server/transport.rs index a6a30ec8..18caa475 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -554,7 +554,7 @@ where tx_recv: mpsc::UnboundedSender<(ReplicaId, PeerMessage)>, tx_exit: mpsc::UnboundedSender, ) { - pf_debug!(me; "peer_messenger thread for {} ({}) spawned", id, addr); + pf_debug!(me; "peer_messenger thread for {} '{}' spawned", id, addr); let (mut conn_read, conn_write) = conn.into_split(); let mut read_buf = BytesMut::with_capacity(8 + 1024); @@ -680,7 +680,7 @@ where if let Err(e) = tx_exit.send(id) { pf_error!(me; "error sending exit signal for {}: {}", id, e); } - pf_debug!(me; "peer_messenger thread for {} ({}) exitted", id, addr); + pf_debug!(me; "peer_messenger thread for {} '{}' exitted", id, addr); } } diff --git a/summerset_client/src/drivers/open_loop.rs b/summerset_client/src/drivers/open_loop.rs index 382091f1..8e49c107 100644 --- a/summerset_client/src/drivers/open_loop.rs +++ b/summerset_client/src/drivers/open_loop.rs @@ -13,8 +13,7 @@ use tokio::time::{Duration, Instant}; use summerset::{ GenericEndpoint, ClientId, Command, ApiRequest, ApiReply, RequestId, - ClientCtrlStub, Timer, SummersetError, pf_trace, pf_debug, pf_error, - logged_err, + ClientCtrlStub, Timer, SummersetError, pf_debug, pf_error, logged_err, }; /// Open-loop driver struct. @@ -61,24 +60,11 @@ impl DriverOpenLoop { self.endpoint.connect().await } - /// Waits for all pending replies to be received, then sends leave - /// notification and forgets about the current TCP connections. The leave - /// action is left synchronous. 
+ /// Sends leave notification and forgets about the current TCP connections. pub async fn leave( &mut self, permanent: bool, ) -> Result<(), SummersetError> { - // loop until all pending replies have been received - while self.should_retry { - pf_trace!(self.id; "retrying last issue at leave"); - self.issue_retry()?; - } - while !self.pending_reqs.is_empty() { - pf_trace!(self.id; "pending {} requests at leave", - self.pending_reqs.len()); - self.wait_reply().await?; - } - self.endpoint.leave(permanent).await } From 8beefa7228538d91ffde966544d5578e02822f04 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 14 Sep 2023 15:55:28 -0500 Subject: [PATCH 45/89] add leader info to manager for testing --- src/manager/clusman.rs | 44 +++++++++++++-- src/manager/reactor.rs | 40 ++++++------- src/manager/reigner.rs | 3 + src/protocols/crossword.rs | 11 +++- src/protocols/multipaxos.rs | 11 +++- src/protocols/rep_nothing.rs | 4 +- src/protocols/rs_paxos.rs | 11 +++- src/protocols/simple_push.rs | 4 +- summerset_client/src/clients/tester.rs | 77 ++++++++++++++++++-------- 9 files changed, 148 insertions(+), 57 deletions(-) diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index 2a523972..62b39a60 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -14,7 +14,6 @@ use crate::protocols::SmrProtocol; use tokio::sync::{mpsc, watch}; /// Information about an active server. -// TODO: maybe add things like leader info, etc. #[derive(Debug, Clone)] struct ServerInfo { /// The server's client-facing API address. @@ -22,6 +21,9 @@ struct ServerInfo { /// The server's internal peer-peer API address. p2p_addr: SocketAddr, + + /// This server is a leader (leader could be non-unique). + is_leader: bool, } /// Standalone cluster manager oracle. @@ -192,11 +194,39 @@ impl ClusterManager { )?; // save new server's info - self.server_info - .insert(server, ServerInfo { api_addr, p2p_addr }); + self.server_info.insert( + server, + ServerInfo { + api_addr, + p2p_addr, + is_leader: false, + }, + ); Ok(()) } + /// Handler of LeaderStatus message. + fn handle_leader_status( + &mut self, + server: ReplicaId, + step_up: bool, + ) -> Result<(), SummersetError> { + if !self.server_info.contains_key(&server) { + return logged_err!("m"; "leader status got unknown ID: {}", server); + } + + // update this server's info + let info = self.server_info.get_mut(&server).unwrap(); + if step_up && info.is_leader { + logged_err!("m"; "server {} is already marked as leader", server) + } else if !step_up && !info.is_leader { + logged_err!("m"; "server {} is already marked as non-leader", server) + } else { + info.is_leader = step_up; + Ok(()) + } + } + /// Synthesized handler of server-initiated control messages. 
async fn handle_ctrl_msg( &mut self, @@ -220,6 +250,10 @@ impl ClusterManager { )?; } + CtrlMsg::LeaderStatus { step_up } => { + self.handle_leader_status(server, step_up)?; + } + _ => {} // ignore all other types } @@ -235,10 +269,10 @@ impl ClusterManager { client: ClientId, ) -> Result<(), SummersetError> { // gather public addresses of all active servers - let servers: HashMap = self + let servers: HashMap = self .server_info .iter() - .map(|(&server, info)| (server, info.api_addr)) + .map(|(&server, info)| (server, (info.api_addr, info.is_leader))) .collect(); self.client_reactor .send_reply(CtrlReply::QueryInfo { servers }, client) diff --git a/src/manager/reactor.rs b/src/manager/reactor.rs index 1942f591..4a2f5bd6 100644 --- a/src/manager/reactor.rs +++ b/src/manager/reactor.rs @@ -20,7 +20,6 @@ use tokio::sync::mpsc; use tokio::task::JoinHandle; /// Control event request from client. -// TODO: maybe add things like leader info, etc. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] pub enum CtrlRequest { /// Query the set of active servers and their info. @@ -43,7 +42,8 @@ pub enum CtrlRequest { pub enum CtrlReply { /// Reply to server info query. QueryInfo { - servers: HashMap, + /// Map from replica ID -> (addr, is_leader). + servers: HashMap, }, /// Reply to server reset request. @@ -448,9 +448,9 @@ mod reactor_tests { // send reply to client reactor.send_reply( CtrlReply::QueryInfo { - servers: HashMap::::from([ - (0, "127.0.0.1:53700".parse()?), - (1, "127.0.0.1:53701".parse()?), + servers: HashMap::::from([ + (0, ("127.0.0.1:53700".parse()?, true)), + (1, ("127.0.0.1:53701".parse()?, false)), ]), }, client, @@ -467,9 +467,9 @@ mod reactor_tests { assert_eq!( ctrl_stub.recv_reply().await?, CtrlReply::QueryInfo { - servers: HashMap::::from([ - (0, "127.0.0.1:53700".parse()?), - (1, "127.0.0.1:53701".parse()?), + servers: HashMap::::from([ + (0, ("127.0.0.1:53700".parse()?, true)), + (1, ("127.0.0.1:53701".parse()?, false)), ]), } ); @@ -492,9 +492,9 @@ mod reactor_tests { assert_eq!( ctrl_stub.recv_reply().await?, CtrlReply::QueryInfo { - servers: HashMap::::from([ - (0, "127.0.0.1:54700".parse()?), - (1, "127.0.0.1:54701".parse()?), + servers: HashMap::::from([ + (0, ("127.0.0.1:54700".parse()?, true)), + (1, ("127.0.0.1:54701".parse()?, false)), ]), } ); @@ -512,9 +512,9 @@ mod reactor_tests { assert_eq!( ctrl_stub.recv_reply().await?, CtrlReply::QueryInfo { - servers: HashMap::::from([ - (0, "127.0.0.1:54710".parse()?), - (1, "127.0.0.1:54711".parse()?), + servers: HashMap::::from([ + (0, ("127.0.0.1:54710".parse()?, true)), + (1, ("127.0.0.1:54711".parse()?, false)), ]), } ); @@ -531,9 +531,9 @@ mod reactor_tests { // send reply to client reactor.send_reply( CtrlReply::QueryInfo { - servers: HashMap::::from([ - (0, "127.0.0.1:54700".parse()?), - (1, "127.0.0.1:54701".parse()?), + servers: HashMap::::from([ + (0, ("127.0.0.1:54700".parse()?, true)), + (1, ("127.0.0.1:54701".parse()?, false)), ]), }, client, @@ -546,9 +546,9 @@ mod reactor_tests { // send reply to new client reactor.send_reply( CtrlReply::QueryInfo { - servers: HashMap::::from([ - (0, "127.0.0.1:54710".parse()?), - (1, "127.0.0.1:54711".parse()?), + servers: HashMap::::from([ + (0, ("127.0.0.1:54710".parse()?, true)), + (1, ("127.0.0.1:54711".parse()?, false)), ]), }, client2, diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index b28b9262..a5a04450 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -39,6 +39,9 @@ pub enum CtrlMsg { to_peers: HashMap, }, + /// Server 
-> Manager: tell the manager that I steped-up/down as leader. + LeaderStatus { step_up: bool }, + /// Manager -> Server: reset to initial state. If durable is false, cleans /// durable storage state as well. ResetState { durable: bool }, diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 1140a69c..9d4c3100 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -1145,6 +1145,8 @@ impl CrosswordReplica { fn become_a_leader(&mut self) -> Result<(), SummersetError> { assert!(!self.is_leader); self.is_leader = true; // this starts broadcasting heartbeats + self.control_hub + .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; pf_warn!(self.id; "becoming a leader..."); // broadcast a heartbeat right now @@ -1239,6 +1241,8 @@ impl CrosswordReplica { // clear my leader status if it carries a higher ballot number if self.is_leader && ballot > self.bal_max_seen { self.is_leader = false; + self.control_hub + .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; } // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); @@ -1811,13 +1815,16 @@ impl GenericEndpoint for CrosswordClient { match reply { CtrlReply::QueryInfo { servers } => { // establish connection to all servers - for (&id, &server) in &servers { + self.servers = servers + .into_iter() + .map(|(id, info)| (id, info.0)) + .collect(); + for (&id, &server) in &self.servers { pf_info!(self.id; "connecting to server {} '{}'...", id, server); let api_stub = ClientApiStub::new_by_connect(self.id, server).await?; self.api_stubs.insert(id, api_stub); } - self.servers = servers; Ok(()) } _ => logged_err!(self.id; "unexpected reply type received"), diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 130a2f45..520ff556 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -960,6 +960,8 @@ impl MultiPaxosReplica { fn become_a_leader(&mut self) -> Result<(), SummersetError> { assert!(!self.is_leader); self.is_leader = true; // this starts broadcasting heartbeats + self.control_hub + .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; pf_warn!(self.id; "becoming a leader..."); // broadcast a heartbeat right now @@ -1054,6 +1056,8 @@ impl MultiPaxosReplica { // clear my leader status if it carries a higher ballot number if self.is_leader && ballot > self.bal_max_seen { self.is_leader = false; + self.control_hub + .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; } // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); @@ -1585,13 +1589,16 @@ impl GenericEndpoint for MultiPaxosClient { match reply { CtrlReply::QueryInfo { servers } => { // establish connection to all servers - for (&id, &server) in &servers { + self.servers = servers + .into_iter() + .map(|(id, info)| (id, info.0)) + .collect(); + for (&id, &server) in &self.servers { pf_info!(self.id; "connecting to server {} '{}'...", id, server); let api_stub = ClientApiStub::new_by_connect(self.id, server).await?; self.api_stubs.insert(id, api_stub); } - self.servers = servers; Ok(()) } _ => logged_err!(self.id; "unexpected reply type received"), diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index 643cdf7a..ba73a21a 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -582,10 +582,10 @@ impl GenericEndpoint for RepNothingClient { CtrlReply::QueryInfo { servers } => { // connect to the one with server ID in config pf_info!(self.id; "connecting to server {} '{}'...", - self.config.server_id, 
servers[&self.config.server_id]); + self.config.server_id, servers[&self.config.server_id].0); let api_stub = ClientApiStub::new_by_connect( self.id, - servers[&self.config.server_id], + servers[&self.config.server_id].0, ) .await?; self.api_stub = Some(api_stub); diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 172c9771..ce987f33 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -1045,6 +1045,8 @@ impl RSPaxosReplica { fn become_a_leader(&mut self) -> Result<(), SummersetError> { assert!(!self.is_leader); self.is_leader = true; // this starts broadcasting heartbeats + self.control_hub + .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; pf_warn!(self.id; "becoming a leader..."); // broadcast a heartbeat right now @@ -1139,6 +1141,8 @@ impl RSPaxosReplica { // clear my leader status if it carries a higher ballot number if self.is_leader && ballot > self.bal_max_seen { self.is_leader = false; + self.control_hub + .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; } // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); @@ -1704,13 +1708,16 @@ impl GenericEndpoint for RSPaxosClient { match reply { CtrlReply::QueryInfo { servers } => { // establish connection to all servers - for (&id, &server) in &servers { + self.servers = servers + .into_iter() + .map(|(id, info)| (id, info.0)) + .collect(); + for (&id, &server) in &self.servers { pf_info!(self.id; "connecting to server {} '{}'...", id, server); let api_stub = ClientApiStub::new_by_connect(self.id, server).await?; self.api_stubs.insert(id, api_stub); } - self.servers = servers; Ok(()) } _ => logged_err!(self.id; "unexpected reply type received"), diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index 593cbd27..56b28414 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -809,10 +809,10 @@ impl GenericEndpoint for SimplePushClient { CtrlReply::QueryInfo { servers } => { // connect to the one with server ID in config pf_info!(self.id; "connecting to server {} '{}'...", - self.config.server_id, servers[&self.config.server_id]); + self.config.server_id, servers[&self.config.server_id].0); let api_stub = ClientApiStub::new_by_connect( self.id, - servers[&self.config.server_id], + servers[&self.config.server_id].0, ) .await?; self.api_stub = Some(api_stub); diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index 607dbfb0..a3c1ab64 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -1,6 +1,6 @@ //! Correctness testing client using closed-loop driver. -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use crate::drivers::{DriverReply, DriverClosedLoop}; @@ -27,7 +27,8 @@ lazy_static! { static ref ALL_TESTS: Vec<(&'static str, bool)> = vec![ ("primitive_ops", true), ("client_reconnect", true), - ("one_node_reset", true), + ("leader_node_reset", true), + ("non_leader_reset", true), ("two_nodes_reset", false) ]; } @@ -213,10 +214,11 @@ impl ClientTester { } } - /// Query the list of servers in the cluster. + /// Query the list of servers in the cluster. Returns a map from replica ID + /// -> is_leader status. 
async fn query_servers( &mut self, - ) -> Result, SummersetError> { + ) -> Result, SummersetError> { let ctrl_stub = self.driver.ctrl_stub(); // send QueryInfo request to manager @@ -230,7 +232,7 @@ impl ClientTester { let reply = ctrl_stub.recv_reply().await?; match reply { CtrlReply::QueryInfo { servers } => { - Ok(servers.keys().copied().collect()) + Ok(servers.into_iter().map(|(id, info)| (id, info.1)).collect()) } _ => logged_err!(self.driver.id; "unexpected control reply type"), } @@ -271,7 +273,8 @@ impl ClientTester { let result = match name { "primitive_ops" => self.test_primitive_ops().await, "client_reconnect" => self.test_client_reconnect().await, - "one_node_reset" => self.test_one_node_reset().await, + "leader_node_reset" => self.test_leader_node_reset().await, + "non_leader_reset" => self.test_non_leader_reset().await, "two_nodes_reset" => self.test_two_nodes_reset().await, _ => { return logged_err!(self.driver.id; "unrecognized test name '{}'", @@ -358,32 +361,62 @@ impl ClientTester { Ok(()) } - /// Single replica node crashes and restarts. - async fn test_one_node_reset(&mut self) -> Result<(), SummersetError> { + /// Single leader replica node crashes and restarts. + async fn test_leader_node_reset(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); self.checked_put("Jose", &v, Some(None)).await?; - for s in self.query_servers().await? { - self.driver.leave(false).await?; - self.reset_servers(HashSet::from([s]), true).await?; - time::sleep(Duration::from_millis(500)).await; - self.driver.connect().await?; - self.checked_get("Jose", Some(Some(&v))).await?; + for (s, is_leader) in self.query_servers().await? { + if is_leader { + self.driver.leave(false).await?; + self.reset_servers(HashSet::from([s]), true).await?; + time::sleep(Duration::from_millis(500)).await; + self.driver.connect().await?; + self.checked_get("Jose", Some(Some(&v))).await?; + break; + } + } + Ok(()) + } + + /// Single leader replica node crashes and restarts. + async fn test_non_leader_reset(&mut self) -> Result<(), SummersetError> { + let v = Self::gen_rand_string(8); + self.checked_put("Jose", &v, Some(None)).await?; + for (s, is_leader) in self.query_servers().await? { + if !is_leader { + self.driver.leave(false).await?; + self.reset_servers(HashSet::from([s]), true).await?; + time::sleep(Duration::from_millis(500)).await; + self.driver.connect().await?; + self.checked_get("Jose", Some(Some(&v))).await?; + break; + } } Ok(()) } - /// Two replica nodes crash and restart. + /// Two replica nodes (leader + non-leader) crash and restart. async fn test_two_nodes_reset(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); self.checked_put("Jose", &v, Some(None)).await?; - let servers = self.query_servers().await?; - for &s in &servers { + let mut resets = HashSet::new(); + let (mut l, mut nl) = (false, false); + for (s, is_leader) in self.query_servers().await? 
{ + if !l && is_leader { + resets.insert(s); + l = true; + } + if !nl && !is_leader { + resets.insert(s); + nl = true; + } + if l && nl { + break; + } + } + if resets.len() == 2 { self.driver.leave(false).await?; - self.reset_servers( - HashSet::from([s, (s + 1) % (servers.len() as u8)]), - true, - ) - .await?; + self.reset_servers(resets, true).await?; time::sleep(Duration::from_millis(500)).await; self.driver.connect().await?; self.checked_get("Jose", Some(Some(&v))).await?; From 3a3e00d88e0b94e4299265fa6b144439dfeff6b0 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 14 Sep 2023 21:24:01 -0500 Subject: [PATCH 46/89] add pause & resume control requests --- src/manager/clusman.rs | 94 ++++++++++++++++++++- src/manager/reactor.rs | 18 ++++ src/manager/reigner.rs | 14 +++- src/protocols/crossword.rs | 56 +++++++++++-- src/protocols/multipaxos.rs | 56 +++++++++++-- src/protocols/rep_nothing.rs | 43 ++++++++-- src/protocols/rs_paxos.rs | 56 +++++++++++-- src/protocols/simple_push.rs | 45 ++++++++-- summerset_client/src/clients/tester.rs | 110 ++++++++++++++++++++++--- 9 files changed, 442 insertions(+), 50 deletions(-) diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index 62b39a60..0eeb1f1f 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -22,8 +22,11 @@ struct ServerInfo { /// The server's internal peer-peer API address. p2p_addr: SocketAddr, - /// This server is a leader (leader could be non-unique). + /// This server is a leader? (leader could be non-unique) is_leader: bool, + + /// This server is currently paused? + is_paused: bool, } /// Standalone cluster manager oracle. @@ -200,6 +203,7 @@ impl ClusterManager { api_addr, p2p_addr, is_leader: false, + is_paused: false, }, ); Ok(()) @@ -332,6 +336,86 @@ impl ClusterManager { ) } + /// Handler of client PauseServers request. + async fn handle_client_pause_servers( + &mut self, + client: ClientId, + servers: HashSet, + ) -> Result<(), SummersetError> { + let mut servers: Vec = if servers.is_empty() { + // all active servers + self.server_info.keys().copied().collect() + } else { + servers.into_iter().collect() + }; + + // pause specified server(s) + let mut pause_done = HashSet::new(); + while let Some(s) = servers.pop() { + // send puase server control message to server + self.server_reigner.send_ctrl(CtrlMsg::Pause, s)?; + + // set the is_paused flag + assert!(self.server_info.contains_key(&s)); + self.server_info.get_mut(&s).unwrap().is_paused = true; + + // wait for dummy reply + let (_, reply) = self.server_reigner.recv_ctrl().await?; + if reply != CtrlMsg::PauseReply { + return logged_err!("m"; "unexpected reply type received"); + } + + pause_done.insert(s); + } + + self.client_reactor.send_reply( + CtrlReply::PauseServers { + servers: pause_done, + }, + client, + ) + } + + /// Handler of client ResumeServers request. 
+ async fn handle_client_resume_servers( + &mut self, + client: ClientId, + servers: HashSet, + ) -> Result<(), SummersetError> { + let mut servers: Vec = if servers.is_empty() { + // all active servers + self.server_info.keys().copied().collect() + } else { + servers.into_iter().collect() + }; + + // resume specified server(s) + let mut resume_done = HashSet::new(); + while let Some(s) = servers.pop() { + // send puase server control message to server + self.server_reigner.send_ctrl(CtrlMsg::Resume, s)?; + + // clear the is_paused flag + assert!(self.server_info.contains_key(&s)); + self.server_info.get_mut(&s).unwrap().is_paused = false; + + // wait for dummy reply + let (_, reply) = self.server_reigner.recv_ctrl().await?; + if reply != CtrlMsg::ResumeReply { + return logged_err!("m"; "unexpected reply type received"); + } + + resume_done.insert(s); + } + + self.client_reactor.send_reply( + CtrlReply::ResumeServers { + servers: resume_done, + }, + client, + ) + } + /// Synthesized handler of client-initiated control requests. async fn handle_ctrl_req( &mut self, @@ -349,6 +433,14 @@ impl ClusterManager { .await?; } + CtrlRequest::PauseServers { servers } => { + self.handle_client_pause_servers(client, servers).await?; + } + + CtrlRequest::ResumeServers { servers } => { + self.handle_client_resume_servers(client, servers).await?; + } + _ => {} // ignore all other types } diff --git a/src/manager/reactor.rs b/src/manager/reactor.rs index 4a2f5bd6..4df04e36 100644 --- a/src/manager/reactor.rs +++ b/src/manager/reactor.rs @@ -33,6 +33,18 @@ pub enum CtrlRequest { durable: bool, }, + /// Pause the specified server(s)' event loop execution. + PauseServers { + /// IDs of servers to pause. If empty, pauses all active servers. + servers: HashSet, + }, + + /// Resume the specified server(s)' event loop execution. + ResumeServers { + /// IDs of servers to resume. If empty, resumes all active servers. + servers: HashSet, + }, + /// Client leave notification. Leave, } @@ -49,6 +61,12 @@ pub enum CtrlReply { /// Reply to server reset request. ResetServers { servers: HashSet }, + /// Reply to server pause request. + PauseServers { servers: HashSet }, + + /// Reply to server resume request. + ResumeServers { servers: HashSet }, + /// Reply to client leave notification. Leave, } diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index a5a04450..c3551e30 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -21,7 +21,7 @@ use tokio::task::JoinHandle; /// Control message from/to servers. Control traffic could be bidirectional: /// some initiated by the manager and some by servers. -// TODO: add pause, resume, leader change, membership change, etc. +// TODO: later add leader change, membership change, etc. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] pub enum CtrlMsg { /// Server -> Manager: new server up, requesting a list of peers' addresses @@ -46,6 +46,18 @@ pub enum CtrlMsg { /// durable storage state as well. ResetState { durable: bool }, + /// Manager -> Server: pause server event loop execution. + Pause, + + /// Server -> Manager: dummy pause reply. + PauseReply, + + /// Manager -> Server: resume server event loop execution. + Resume, + + /// Server -> Manager: dummy resume reply. + ResumeReply, + /// Server -> Manager: leave notification. 
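// Taken together, the request/reply pairs above give external tooling a full
// pause round trip: a client sends CtrlRequest::PauseServers to the manager,
// the manager exchanges CtrlMsg::Pause / CtrlMsg::PauseReply with each target
// server, and the client receives CtrlReply::PauseServers back. A sketch of
// the client side, following the same send_req()/recv_reply() retry pattern
// the tester uses further below (the helper and its error text are
// illustrative, not part of the patch):
async fn pause_one_server(
    ctrl_stub: &mut ClientCtrlStub,
    server: ReplicaId,
) -> Result<(), SummersetError> {
    let req = CtrlRequest::PauseServers {
        servers: std::collections::HashSet::from([server]),
    };
    // send_req(Some(..)) may report the request as not yet fully sent; keep
    // nudging it with send_req(None) until it goes out
    let mut sent = ctrl_stub.send_req(Some(&req))?;
    while !sent {
        sent = ctrl_stub.send_req(None)?;
    }
    match ctrl_stub.recv_reply().await? {
        CtrlReply::PauseServers { servers } if servers.contains(&server) => Ok(()),
        _ => Err(SummersetError("unexpected reply to PauseServers".into())),
    }
}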
Leave, diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 9d4c3100..cdfe2517 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -1147,7 +1147,7 @@ impl CrosswordReplica { self.is_leader = true; // this starts broadcasting heartbeats self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; - pf_warn!(self.id; "becoming a leader..."); + pf_info!(self.id; "becoming a leader..."); // broadcast a heartbeat right now self.bcast_heartbeats()?; @@ -1243,6 +1243,7 @@ impl CrosswordReplica { self.is_leader = false; self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; + pf_info!(self.id; "no longer a leader..."); } // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); @@ -1287,20 +1288,56 @@ impl CrosswordReplica { Ok(()) } + /// Handler of Pause control message. + fn handle_ctrl_pause( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got pause req"); + *paused = true; + self.control_hub.send_ctrl(CtrlMsg::PauseReply)?; + Ok(()) + } + + /// Handler of Resume control message. + fn handle_ctrl_resume( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got resume req"); + *paused = false; + self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; + + // reset leader heartbeat timer + self.kickoff_hb_hear_timer()?; + + Ok(()) + } + /// Synthesized handler of manager control messages. If ok, returns /// `Some(true)` if decides to terminate and reboot, `Some(false)` if /// decides to shutdown completely, and `None` if not terminating. async fn handle_ctrl_msg( &mut self, msg: CtrlMsg, + paused: &mut bool, ) -> Result, SummersetError> { - // TODO: fill this when more control message types added match msg { CtrlMsg::ResetState { durable } => { self.handle_ctrl_reset_state(durable).await?; Ok(Some(true)) } + CtrlMsg::Pause => { + self.handle_ctrl_pause(paused)?; + Ok(None) + } + + CtrlMsg::Resume => { + self.handle_ctrl_resume(paused)?; + Ok(None) + } + _ => Ok(None), // ignore all other types } } @@ -1639,10 +1676,11 @@ impl GenericReplica for CrosswordReplica { self.kickoff_hb_hear_timer()?; // main event loop + let mut paused = false; loop { tokio::select! 
{ // client request batch - req_batch = self.external_api.get_req_batch() => { + req_batch = self.external_api.get_req_batch(), if !paused => { if let Err(e) = req_batch { pf_error!(self.id; "error getting req batch: {}", e); continue; @@ -1654,7 +1692,7 @@ impl GenericReplica for CrosswordReplica { }, // durable logging result - log_result = self.storage_hub.get_result() => { + log_result = self.storage_hub.get_result(), if !paused => { if let Err(e) = log_result { pf_error!(self.id; "error getting log result: {}", e); continue; @@ -1667,7 +1705,7 @@ impl GenericReplica for CrosswordReplica { }, // message from peer - msg = self.transport_hub.recv_msg() => { + msg = self.transport_hub.recv_msg(), if !paused => { if let Err(e) = msg { pf_error!(self.id; "error receiving peer msg: {}", e); continue; @@ -1679,7 +1717,7 @@ impl GenericReplica for CrosswordReplica { } // state machine execution result - cmd_result = self.state_machine.get_result() => { + cmd_result = self.state_machine.get_result(), if !paused => { if let Err(e) = cmd_result { pf_error!(self.id; "error getting cmd result: {}", e); continue; @@ -1691,12 +1729,12 @@ impl GenericReplica for CrosswordReplica { }, // leader inactivity timeout - _ = self.hb_hear_timer.timeout() => { + _ = self.hb_hear_timer.timeout(), if !paused => { self.become_a_leader()?; }, // leader sending heartbeat - _ = self.hb_send_interval.tick(), if self.is_leader => { + _ = self.hb_send_interval.tick(), if !paused && self.is_leader => { self.bcast_heartbeats()?; } @@ -1707,7 +1745,7 @@ impl GenericReplica for CrosswordReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - match self.handle_ctrl_msg(ctrl_msg).await { + match self.handle_ctrl_msg(ctrl_msg, &mut paused).await { Ok(terminate) => { if let Some(restart) = terminate { pf_warn!( diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 520ff556..759e7cf3 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -962,7 +962,7 @@ impl MultiPaxosReplica { self.is_leader = true; // this starts broadcasting heartbeats self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; - pf_warn!(self.id; "becoming a leader..."); + pf_info!(self.id; "becoming a leader..."); // broadcast a heartbeat right now self.bcast_heartbeats()?; @@ -1058,6 +1058,7 @@ impl MultiPaxosReplica { self.is_leader = false; self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; + pf_info!(self.id; "no longer a leader..."); } // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); @@ -1102,20 +1103,56 @@ impl MultiPaxosReplica { Ok(()) } + /// Handler of Pause control message. + fn handle_ctrl_pause( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got pause req"); + *paused = true; + self.control_hub.send_ctrl(CtrlMsg::PauseReply)?; + Ok(()) + } + + /// Handler of Resume control message. + fn handle_ctrl_resume( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got resume req"); + *paused = false; + self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; + + // reset leader heartbeat timer + self.kickoff_hb_hear_timer()?; + + Ok(()) + } + /// Synthesized handler of manager control messages. If ok, returns /// `Some(true)` if decides to terminate and reboot, `Some(false)` if /// decides to shutdown completely, and `None` if not terminating. 
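// The event-loop edits above follow one pattern that repeats in every
// protocol below: each data-path select! branch gains an `if !paused`
// precondition, while the branch that receives manager control messages stays
// ungated so a paused replica can still hear Resume. A self-contained sketch
// of that gating; the channel names and the bool "control message" are
// placeholders rather than Summerset types:
async fn paused_gated_loop(
    mut ctrl_rx: tokio::sync::mpsc::UnboundedReceiver<bool>,
    mut data_rx: tokio::sync::mpsc::UnboundedReceiver<u64>,
) {
    let mut paused = false;
    loop {
        tokio::select! {
            // normal work: this branch is disabled entirely while paused
            Some(req) = data_rx.recv(), if !paused => {
                let _ = req; // ... handle request batch / log result / etc.
            },
            // control traffic: never gated, so Resume always gets through
            Some(pause) = ctrl_rx.recv() => {
                paused = pause; // true = Pause, false = Resume
            },
            else => break, // both channels closed
        }
    }
}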
async fn handle_ctrl_msg( &mut self, msg: CtrlMsg, + paused: &mut bool, ) -> Result, SummersetError> { - // TODO: fill this when more control message types added match msg { CtrlMsg::ResetState { durable } => { self.handle_ctrl_reset_state(durable).await?; Ok(Some(true)) } + CtrlMsg::Pause => { + self.handle_ctrl_pause(paused)?; + Ok(None) + } + + CtrlMsg::Resume => { + self.handle_ctrl_resume(paused)?; + Ok(None) + } + _ => Ok(None), // ignore all other types } } @@ -1413,10 +1450,11 @@ impl GenericReplica for MultiPaxosReplica { self.kickoff_hb_hear_timer()?; // main event loop + let mut paused = false; loop { tokio::select! { // client request batch - req_batch = self.external_api.get_req_batch() => { + req_batch = self.external_api.get_req_batch(), if !paused => { if let Err(e) = req_batch { pf_error!(self.id; "error getting req batch: {}", e); continue; @@ -1428,7 +1466,7 @@ impl GenericReplica for MultiPaxosReplica { }, // durable logging result - log_result = self.storage_hub.get_result() => { + log_result = self.storage_hub.get_result(), if !paused => { if let Err(e) = log_result { pf_error!(self.id; "error getting log result: {}", e); continue; @@ -1441,7 +1479,7 @@ impl GenericReplica for MultiPaxosReplica { }, // message from peer - msg = self.transport_hub.recv_msg() => { + msg = self.transport_hub.recv_msg(), if !paused => { if let Err(e) = msg { pf_error!(self.id; "error receiving peer msg: {}", e); continue; @@ -1453,7 +1491,7 @@ impl GenericReplica for MultiPaxosReplica { } // state machine execution result - cmd_result = self.state_machine.get_result() => { + cmd_result = self.state_machine.get_result(), if !paused => { if let Err(e) = cmd_result { pf_error!(self.id; "error getting cmd result: {}", e); continue; @@ -1465,12 +1503,12 @@ impl GenericReplica for MultiPaxosReplica { }, // leader inactivity timeout - _ = self.hb_hear_timer.timeout() => { + _ = self.hb_hear_timer.timeout(), if !paused => { self.become_a_leader()?; }, // leader sending heartbeat - _ = self.hb_send_interval.tick(), if self.is_leader => { + _ = self.hb_send_interval.tick(), if !paused && self.is_leader => { self.bcast_heartbeats()?; } @@ -1481,7 +1519,7 @@ impl GenericReplica for MultiPaxosReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - match self.handle_ctrl_msg(ctrl_msg).await { + match self.handle_ctrl_msg(ctrl_msg, &mut paused).await { Ok(terminate) => { if let Some(restart) = terminate { pf_warn!( diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index ba73a21a..97469f5f 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -272,20 +272,52 @@ impl RepNothingReplica { Ok(()) } + /// Handler of Pause control message. + fn handle_ctrl_pause( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got pause req"); + *paused = true; + self.control_hub.send_ctrl(CtrlMsg::PauseReply)?; + Ok(()) + } + + /// Handler of Resume control message. + fn handle_ctrl_resume( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got resume req"); + *paused = false; + self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; + Ok(()) + } + /// Synthesized handler of manager control messages. If ok, returns /// `Some(true)` if decides to terminate and reboot, `Some(false)` if /// decides to shutdown completely, and `None` if not terminating. 
async fn handle_ctrl_msg( &mut self, msg: CtrlMsg, + paused: &mut bool, ) -> Result, SummersetError> { - // TODO: fill this when more control message types added match msg { CtrlMsg::ResetState { durable } => { self.handle_ctrl_reset_state(durable).await?; Ok(Some(true)) } + CtrlMsg::Pause => { + self.handle_ctrl_pause(paused)?; + Ok(None) + } + + CtrlMsg::Resume => { + self.handle_ctrl_resume(paused)?; + Ok(None) + } + _ => Ok(None), // ignore all other types } } @@ -437,10 +469,11 @@ impl GenericReplica for RepNothingReplica { self.recover_from_log().await?; // main event loop + let mut paused = false; loop { tokio::select! { // client request batch - req_batch = self.external_api.get_req_batch() => { + req_batch = self.external_api.get_req_batch(), if !paused => { if let Err(e) = req_batch { pf_error!(self.id; "error getting req batch: {}", e); continue; @@ -452,7 +485,7 @@ impl GenericReplica for RepNothingReplica { }, // durable logging result - log_result = self.storage_hub.get_result() => { + log_result = self.storage_hub.get_result(), if !paused => { if let Err(e) = log_result { pf_error!(self.id; "error getting log result: {}", e); continue; @@ -464,7 +497,7 @@ impl GenericReplica for RepNothingReplica { }, // state machine execution result - cmd_result = self.state_machine.get_result() => { + cmd_result = self.state_machine.get_result(), if !paused => { if let Err(e) = cmd_result { pf_error!(self.id; "error getting cmd result: {}", e); continue; @@ -482,7 +515,7 @@ impl GenericReplica for RepNothingReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - match self.handle_ctrl_msg(ctrl_msg).await { + match self.handle_ctrl_msg(ctrl_msg, &mut paused).await { Ok(terminate) => { if let Some(restart) = terminate { pf_warn!( diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index ce987f33..d963a7b8 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -1047,7 +1047,7 @@ impl RSPaxosReplica { self.is_leader = true; // this starts broadcasting heartbeats self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; - pf_warn!(self.id; "becoming a leader..."); + pf_info!(self.id; "becoming a leader..."); // broadcast a heartbeat right now self.bcast_heartbeats()?; @@ -1143,6 +1143,7 @@ impl RSPaxosReplica { self.is_leader = false; self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; + pf_info!(self.id; "no longer a leader..."); } // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); @@ -1187,20 +1188,56 @@ impl RSPaxosReplica { Ok(()) } + /// Handler of Pause control message. + fn handle_ctrl_pause( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got pause req"); + *paused = true; + self.control_hub.send_ctrl(CtrlMsg::PauseReply)?; + Ok(()) + } + + /// Handler of Resume control message. + fn handle_ctrl_resume( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got resume req"); + *paused = false; + self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; + + // reset leader heartbeat timer + self.kickoff_hb_hear_timer()?; + + Ok(()) + } + /// Synthesized handler of manager control messages. If ok, returns /// `Some(true)` if decides to terminate and reboot, `Some(false)` if /// decides to shutdown completely, and `None` if not terminating. 
async fn handle_ctrl_msg( &mut self, msg: CtrlMsg, + paused: &mut bool, ) -> Result, SummersetError> { - // TODO: fill this when more control message types added match msg { CtrlMsg::ResetState { durable } => { self.handle_ctrl_reset_state(durable).await?; Ok(Some(true)) } + CtrlMsg::Pause => { + self.handle_ctrl_pause(paused)?; + Ok(None) + } + + CtrlMsg::Resume => { + self.handle_ctrl_resume(paused)?; + Ok(None) + } + _ => Ok(None), // ignore all other types } } @@ -1532,10 +1569,11 @@ impl GenericReplica for RSPaxosReplica { self.kickoff_hb_hear_timer()?; // main event loop + let mut paused = false; loop { tokio::select! { // client request batch - req_batch = self.external_api.get_req_batch() => { + req_batch = self.external_api.get_req_batch(), if !paused => { if let Err(e) = req_batch { pf_error!(self.id; "error getting req batch: {}", e); continue; @@ -1547,7 +1585,7 @@ impl GenericReplica for RSPaxosReplica { }, // durable logging result - log_result = self.storage_hub.get_result() => { + log_result = self.storage_hub.get_result(), if !paused => { if let Err(e) = log_result { pf_error!(self.id; "error getting log result: {}", e); continue; @@ -1560,7 +1598,7 @@ impl GenericReplica for RSPaxosReplica { }, // message from peer - msg = self.transport_hub.recv_msg() => { + msg = self.transport_hub.recv_msg(), if !paused => { if let Err(e) = msg { pf_error!(self.id; "error receiving peer msg: {}", e); continue; @@ -1572,7 +1610,7 @@ impl GenericReplica for RSPaxosReplica { } // state machine execution result - cmd_result = self.state_machine.get_result() => { + cmd_result = self.state_machine.get_result(), if !paused => { if let Err(e) = cmd_result { pf_error!(self.id; "error getting cmd result: {}", e); continue; @@ -1584,12 +1622,12 @@ impl GenericReplica for RSPaxosReplica { }, // leader inactivity timeout - _ = self.hb_hear_timer.timeout() => { + _ = self.hb_hear_timer.timeout(), if !paused => { self.become_a_leader()?; }, // leader sending heartbeat - _ = self.hb_send_interval.tick(), if self.is_leader => { + _ = self.hb_send_interval.tick(), if !paused && self.is_leader => { self.bcast_heartbeats()?; } @@ -1600,7 +1638,7 @@ impl GenericReplica for RSPaxosReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - match self.handle_ctrl_msg(ctrl_msg).await { + match self.handle_ctrl_msg(ctrl_msg, &mut paused).await { Ok(terminate) => { if let Some(restart) = terminate { pf_warn!( diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index 56b28414..c42be3bc 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -437,20 +437,52 @@ impl SimplePushReplica { Ok(()) } + /// Handler of Pause control message. + fn handle_ctrl_pause( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got pause req"); + *paused = true; + self.control_hub.send_ctrl(CtrlMsg::PauseReply)?; + Ok(()) + } + + /// Handler of Resume control message. + fn handle_ctrl_resume( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got resume req"); + *paused = false; + self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; + Ok(()) + } + /// Synthesized handler of manager control messages. If ok, returns /// `Some(true)` if decides to terminate and reboot, `Some(false)` if /// decides to shutdown completely, and `None` if not terminating. 
async fn handle_ctrl_msg( &mut self, msg: CtrlMsg, + paused: &mut bool, ) -> Result, SummersetError> { - // TODO: fill this when more control message types added match msg { CtrlMsg::ResetState { durable } => { self.handle_ctrl_reset_state(durable).await?; Ok(Some(true)) } + CtrlMsg::Pause => { + self.handle_ctrl_pause(paused)?; + Ok(None) + } + + CtrlMsg::Resume => { + self.handle_ctrl_resume(paused)?; + Ok(None) + } + _ => Ok(None), // ignore all other types } } @@ -642,10 +674,11 @@ impl GenericReplica for SimplePushReplica { self.recover_from_log().await?; // main event loop + let mut paused = false; loop { tokio::select! { // client request batch - req_batch = self.external_api.get_req_batch() => { + req_batch = self.external_api.get_req_batch(), if !paused => { if let Err(e) = req_batch { pf_error!(self.id; "error getting req batch: {}", e); continue; @@ -657,7 +690,7 @@ impl GenericReplica for SimplePushReplica { }, // durable logging result - log_result = self.storage_hub.get_result() => { + log_result = self.storage_hub.get_result(), if !paused => { if let Err(e) = log_result { pf_error!(self.id; "error getting log result: {}", e); continue; @@ -669,7 +702,7 @@ impl GenericReplica for SimplePushReplica { }, // message from peer - msg = self.transport_hub.recv_msg() => { + msg = self.transport_hub.recv_msg(), if !paused => { if let Err(e) = msg { pf_error!(self.id; "error receiving peer msg: {}", e); continue; @@ -691,7 +724,7 @@ impl GenericReplica for SimplePushReplica { } // state machine execution result - cmd_result = self.state_machine.get_result() => { + cmd_result = self.state_machine.get_result(), if !paused => { if let Err(e) = cmd_result { pf_error!(self.id; "error getting cmd result: {}", e); continue; @@ -709,7 +742,7 @@ impl GenericReplica for SimplePushReplica { continue; } let ctrl_msg = ctrl_msg.unwrap(); - match self.handle_ctrl_msg(ctrl_msg).await { + match self.handle_ctrl_msg(ctrl_msg, &mut paused).await { Ok(terminate) => { if let Some(restart) = terminate { pf_warn!( diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index a3c1ab64..d5b1b8cd 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -27,9 +27,11 @@ lazy_static! { static ref ALL_TESTS: Vec<(&'static str, bool)> = vec![ ("primitive_ops", true), ("client_reconnect", true), - ("leader_node_reset", true), ("non_leader_reset", true), - ("two_nodes_reset", false) + ("leader_node_reset", true), + ("two_nodes_reset", false), + ("non_leader_pause", false), + ("leader_node_pause", false), ]; } @@ -246,7 +248,7 @@ impl ClientTester { ) -> Result<(), SummersetError> { let ctrl_stub = self.driver.ctrl_stub(); - // send ResetServer request to manager + // send ResetServers request to manager let req = CtrlRequest::ResetServers { servers, durable }; let mut sent = ctrl_stub.send_req(Some(&req))?; while !sent { @@ -261,21 +263,69 @@ impl ClientTester { } } + /// Pauses some server(s) in the cluster. + async fn pause_servers( + &mut self, + servers: HashSet, + ) -> Result<(), SummersetError> { + let ctrl_stub = self.driver.ctrl_stub(); + + // send PauseServers request to manager + let req = CtrlRequest::PauseServers { servers }; + let mut sent = ctrl_stub.send_req(Some(&req))?; + while !sent { + sent = ctrl_stub.send_req(None)?; + } + + // wait for reply from manager + let reply = ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::PauseServers { .. 
} => Ok(()), + _ => logged_err!(self.driver.id; "unexpected control reply type"), + } + } + + /// Resume some server(s) in the cluster. + #[allow(dead_code)] + async fn resume_servers( + &mut self, + servers: HashSet, + ) -> Result<(), SummersetError> { + let ctrl_stub = self.driver.ctrl_stub(); + + // send ResumeServers request to manager + let req = CtrlRequest::ResumeServers { servers }; + let mut sent = ctrl_stub.send_req(Some(&req))?; + while !sent { + sent = ctrl_stub.send_req(None)?; + } + + // wait for reply from manager + let reply = ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::ResumeServers { .. } => Ok(()), + _ => logged_err!(self.driver.id; "unexpected control reply type"), + } + } + /// Runs the individual correctness test. async fn do_test_by_name( &mut self, name: &str, ) -> Result<(), SummersetError> { // reset everything to initial state at the start of each test - self.reset_servers(HashSet::new(), false).await?; + // self.reset_servers(HashSet::new(), false).await?; + // time::sleep(Duration::from_secs(1)).await; self.driver.connect().await?; let result = match name { "primitive_ops" => self.test_primitive_ops().await, "client_reconnect" => self.test_client_reconnect().await, - "leader_node_reset" => self.test_leader_node_reset().await, "non_leader_reset" => self.test_non_leader_reset().await, + "leader_node_reset" => self.test_leader_node_reset().await, "two_nodes_reset" => self.test_two_nodes_reset().await, + "non_leader_pause" => self.test_non_leader_pause().await, + "leader_node_pause" => self.test_leader_node_pause().await, _ => { return logged_err!(self.driver.id; "unrecognized test name '{}'", name); @@ -361,12 +411,12 @@ impl ClientTester { Ok(()) } - /// Single leader replica node crashes and restarts. - async fn test_leader_node_reset(&mut self) -> Result<(), SummersetError> { + /// Single non-leader replica node crashes and restarts. + async fn test_non_leader_reset(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); self.checked_put("Jose", &v, Some(None)).await?; for (s, is_leader) in self.query_servers().await? { - if is_leader { + if !is_leader { self.driver.leave(false).await?; self.reset_servers(HashSet::from([s]), true).await?; time::sleep(Duration::from_millis(500)).await; @@ -379,11 +429,11 @@ impl ClientTester { } /// Single leader replica node crashes and restarts. - async fn test_non_leader_reset(&mut self) -> Result<(), SummersetError> { + async fn test_leader_node_reset(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); self.checked_put("Jose", &v, Some(None)).await?; for (s, is_leader) in self.query_servers().await? { - if !is_leader { + if is_leader { self.driver.leave(false).await?; self.reset_servers(HashSet::from([s]), true).await?; time::sleep(Duration::from_millis(500)).await; @@ -423,4 +473,44 @@ impl ClientTester { } Ok(()) } + + /// Single non-leader replica node paused. + async fn test_non_leader_pause(&mut self) -> Result<(), SummersetError> { + let v0 = Self::gen_rand_string(8); + self.checked_put("Jose", &v0, Some(None)).await?; + time::sleep(Duration::from_millis(300)).await; + for (s, is_leader) in self.query_servers().await? 
{ + if !is_leader { + self.driver.leave(false).await?; + self.pause_servers(HashSet::from([s])).await?; + time::sleep(Duration::from_secs(1)).await; + self.driver.connect().await?; + self.checked_get("Jose", Some(Some(&v0))).await?; + let v1 = Self::gen_rand_string(8); + self.checked_put("Jose", &v1, Some(Some(&v0))).await?; + break; + } + } + Ok(()) + } + + /// Single leader replica node paused. + async fn test_leader_node_pause(&mut self) -> Result<(), SummersetError> { + let v0 = Self::gen_rand_string(8); + self.checked_put("Jose", &v0, Some(None)).await?; + time::sleep(Duration::from_millis(300)).await; + for (s, is_leader) in self.query_servers().await? { + if is_leader { + self.driver.leave(false).await?; + self.pause_servers(HashSet::from([s])).await?; + time::sleep(Duration::from_secs(1)).await; + self.driver.connect().await?; + self.checked_get("Jose", Some(Some(&v0))).await?; + let v1 = Self::gen_rand_string(8); + self.checked_put("Jose", &v1, Some(Some(&v0))).await?; + break; + } + } + Ok(()) + } } From 5d2273af52d6d1e5812e1f17e220ac30e65e0818 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 17 Sep 2023 12:16:41 -0500 Subject: [PATCH 47/89] minor updates to tester client --- summerset_client/src/clients/tester.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index d5b1b8cd..e3c27cc0 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -314,8 +314,8 @@ impl ClientTester { name: &str, ) -> Result<(), SummersetError> { // reset everything to initial state at the start of each test - // self.reset_servers(HashSet::new(), false).await?; - // time::sleep(Duration::from_secs(1)).await; + self.reset_servers(HashSet::new(), false).await?; + time::sleep(Duration::from_secs(1)).await; self.driver.connect().await?; let result = match name { From e8bb281d2186579389de8127e3139580d286c0f0 Mon Sep 17 00:00:00 2001 From: josehu Date: Sun, 17 Sep 2023 21:09:13 +0000 Subject: [PATCH 48/89] fix MultiPaxos prepare reply voted bug --- src/manager/clusman.rs | 18 +++- src/manager/reactor.rs | 16 +++- src/protocols/crossword.rs | 26 +++++- src/protocols/multipaxos.rs | 48 ++++++++-- src/protocols/rep_nothing.rs | 13 ++- src/protocols/rs_paxos.rs | 26 +++++- src/protocols/simple_push.rs | 13 ++- src/server/storage.rs | 16 ++-- summerset_client/src/clients/tester.rs | 121 +++++++++++++++++++------ 9 files changed, 230 insertions(+), 67 deletions(-) diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index 0eeb1f1f..7b08e4d9 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -276,10 +276,22 @@ impl ClusterManager { let servers: HashMap = self .server_info .iter() - .map(|(&server, info)| (server, (info.api_addr, info.is_leader))) + .filter_map(|(&server, info)| { + if info.is_paused { + None // ignore paused servers + } else { + Some((server, (info.api_addr, info.is_leader))) + } + }) .collect(); - self.client_reactor - .send_reply(CtrlReply::QueryInfo { servers }, client) + + self.client_reactor.send_reply( + CtrlReply::QueryInfo { + population: self.population, + servers, + }, + client, + ) } /// Handler of client ResetServers request. diff --git a/src/manager/reactor.rs b/src/manager/reactor.rs index 4df04e36..3273c54f 100644 --- a/src/manager/reactor.rs +++ b/src/manager/reactor.rs @@ -54,6 +54,8 @@ pub enum CtrlRequest { pub enum CtrlReply { /// Reply to server info query. QueryInfo { + /// Number of replicas in cluster. 
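// The population count is added so that a client can still find a server to
// talk to when its preferred replica is filtered out of this reply (the
// manager now omits paused servers, per the clusman.rs change above). The
// client-side handling added later in this patch rotates through replica IDs;
// a distilled sketch of that rotation (helper name illustrative, types as
// used in this reply):
fn pick_active_server(
    mut preferred: ReplicaId,
    population: u8,
    servers: &std::collections::HashMap<ReplicaId, (std::net::SocketAddr, bool)>,
) -> ReplicaId {
    assert!(!servers.is_empty());
    // walk forward modulo population until an active (non-paused) server is hit
    while !servers.contains_key(&preferred) {
        preferred = (preferred + 1) % population;
    }
    preferred
}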
+ population: u8, /// Map from replica ID -> (addr, is_leader). servers: HashMap, }, @@ -466,6 +468,7 @@ mod reactor_tests { // send reply to client reactor.send_reply( CtrlReply::QueryInfo { + population: 2, servers: HashMap::::from([ (0, ("127.0.0.1:53700".parse()?, true)), (1, ("127.0.0.1:53701".parse()?, false)), @@ -485,6 +488,7 @@ mod reactor_tests { assert_eq!( ctrl_stub.recv_reply().await?, CtrlReply::QueryInfo { + population: 2, servers: HashMap::::from([ (0, ("127.0.0.1:53700".parse()?, true)), (1, ("127.0.0.1:53701".parse()?, false)), @@ -510,6 +514,7 @@ mod reactor_tests { assert_eq!( ctrl_stub.recv_reply().await?, CtrlReply::QueryInfo { + population: 2, servers: HashMap::::from([ (0, ("127.0.0.1:54700".parse()?, true)), (1, ("127.0.0.1:54701".parse()?, false)), @@ -530,9 +535,10 @@ mod reactor_tests { assert_eq!( ctrl_stub.recv_reply().await?, CtrlReply::QueryInfo { + population: 2, servers: HashMap::::from([ - (0, ("127.0.0.1:54710".parse()?, true)), - (1, ("127.0.0.1:54711".parse()?, false)), + (0, ("127.0.0.1:54700".parse()?, true)), + (1, ("127.0.0.1:54701".parse()?, false)), ]), } ); @@ -549,6 +555,7 @@ mod reactor_tests { // send reply to client reactor.send_reply( CtrlReply::QueryInfo { + population: 2, servers: HashMap::::from([ (0, ("127.0.0.1:54700".parse()?, true)), (1, ("127.0.0.1:54701".parse()?, false)), @@ -564,9 +571,10 @@ mod reactor_tests { // send reply to new client reactor.send_reply( CtrlReply::QueryInfo { + population: 2, servers: HashMap::::from([ - (0, ("127.0.0.1:54710".parse()?, true)), - (1, ("127.0.0.1:54711".parse()?, false)), + (0, ("127.0.0.1:54700".parse()?, true)), + (1, ("127.0.0.1:54701".parse()?, false)), ]), }, client2, diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index cdfe2517..529b35c8 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -1008,7 +1008,6 @@ impl CrosswordReplica { } /// Handler of Commit message from leader. - /// TODO: take care of missing/lost Commit messages fn handle_msg_commit( &mut self, peer: ReplicaId, @@ -1143,7 +1142,10 @@ impl CrosswordReplica { /// Becomes a leader, sends self-initiated Prepare messages to followers /// for all in-progress instances, and starts broadcasting heartbeats. 
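// Note on the change just below: become_a_leader() now returns early when the
// replica already considers itself leader, instead of asserting. This appears
// to guard against the heartbeat-hearing timeout firing again on a replica
// that is already leading (for example around the pause/resume paths added
// earlier), where the previous assert!(!self.is_leader) would have panicked.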
fn become_a_leader(&mut self) -> Result<(), SummersetError> { - assert!(!self.is_leader); + if self.is_leader { + return Ok(()); + } + self.is_leader = true; // this starts broadcasting heartbeats self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; @@ -1851,7 +1853,15 @@ impl GenericEndpoint for CrosswordClient { let reply = self.ctrl_stub.recv_reply().await?; match reply { - CtrlReply::QueryInfo { servers } => { + CtrlReply::QueryInfo { + population, + servers, + } => { + // shift to a new server_id if current one not active + assert!(!servers.is_empty()); + while !servers.contains_key(&self.server_id) { + self.server_id = (self.server_id + 1) % population; + } // establish connection to all servers self.servers = servers .into_iter() @@ -1907,7 +1917,10 @@ impl GenericEndpoint for CrosswordClient { .unwrap() .send_req(req) } else { - Err(SummersetError("client not set up".into())) + Err(SummersetError(format!( + "server_id {} not in api_stubs", + self.server_id + ))) } } @@ -1938,7 +1951,10 @@ impl GenericEndpoint for CrosswordClient { Ok(reply) } else { - Err(SummersetError("client not set up".into())) + Err(SummersetError(format!( + "server_id {} not in api_stubs", + self.server_id + ))) } } diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 759e7cf3..974b95d4 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -132,6 +132,9 @@ struct Instance { /// Batch of client requests. reqs: ReqBatch, + /// Highest ballot and associated value I have accepted. + voted: (Ballot, ReqBatch), + /// Leader-side bookkeeping info. leader_bk: Option, @@ -359,6 +362,7 @@ impl MultiPaxosReplica { bal: 0, status: Status::Null, reqs: req_batch.clone(), + voted: (0, Vec::new()), leader_bk: Some(LeaderBookkeeping { prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, @@ -420,6 +424,7 @@ impl MultiPaxosReplica { slot, inst.bal); // record update to largest accepted ballot and corresponding data + inst.voted = (inst.bal, req_batch.clone()); self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { @@ -458,8 +463,8 @@ impl MultiPaxosReplica { pf_trace!(self.id; "finished PrepareBal logging for slot {} bal {}", slot, self.insts[slot].bal); let inst = &self.insts[slot]; - let voted = if inst.status >= Status::Accepting { - Some((inst.bal, inst.reqs.clone())) + let voted = if inst.voted.0 > 0 { + Some(inst.voted.clone()) } else { None }; @@ -609,6 +614,7 @@ impl MultiPaxosReplica { bal: 0, status: Status::Null, reqs: Vec::new(), + voted: (0, Vec::new()), leader_bk: None, replica_bk: None, external: false, @@ -739,6 +745,7 @@ impl MultiPaxosReplica { bal: 0, status: Status::Null, reqs: Vec::new(), + voted: (0, Vec::new()), leader_bk: None, replica_bk: None, external: false, @@ -756,6 +763,7 @@ impl MultiPaxosReplica { self.bal_max_seen = ballot; // record update to largest prepare ballot + inst.voted = (ballot, reqs.clone()); self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { @@ -829,7 +837,6 @@ impl MultiPaxosReplica { } /// Handler of Commit message from leader. 
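// The heart of the "prepare reply voted" fix in this patch: a PrepareReply
// must report the highest ballot at which this replica actually accepted a
// value, together with that value. Pairing the instance's current ballot with
// its current request batch (as the code did before) can misreport, since the
// ballot may have advanced without a corresponding accept. The new
// per-instance `voted` field records exactly that (ballot, value) pair, and
// the reply-side decision reduces to the sketch below (mirrors the handler
// change above; Instance, Ballot, and ReqBatch are the types from this file):
fn voted_for_prepare_reply(inst: &Instance) -> Option<(Ballot, ReqBatch)> {
    if inst.voted.0 > 0 {
        // something was accepted in this slot before: hand it to the new
        // leader so it re-proposes the correct value
        Some(inst.voted.clone())
    } else {
        // nothing ever accepted here: no voted information to report
        None
    }
}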
- /// TODO: take care of missing/lost Commit messages fn handle_msg_commit( &mut self, peer: ReplicaId, @@ -843,6 +850,7 @@ impl MultiPaxosReplica { bal: 0, status: Status::Null, reqs: Vec::new(), + voted: (0, Vec::new()), leader_bk: None, replica_bk: None, external: false, @@ -958,7 +966,10 @@ impl MultiPaxosReplica { /// Becomes a leader, sends self-initiated Prepare messages to followers /// for all in-progress instances, and starts broadcasting heartbeats. fn become_a_leader(&mut self) -> Result<(), SummersetError> { - assert!(!self.is_leader); + if self.is_leader { + return Ok(()); + } + self.is_leader = true; // this starts broadcasting heartbeats self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; @@ -1120,12 +1131,12 @@ impl MultiPaxosReplica { paused: &mut bool, ) -> Result<(), SummersetError> { pf_warn!(self.id; "server got resume req"); - *paused = false; - self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; // reset leader heartbeat timer self.kickoff_hb_hear_timer()?; + *paused = false; + self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; Ok(()) } @@ -1170,6 +1181,7 @@ impl MultiPaxosReplica { bal: 0, status: Status::Null, reqs: Vec::new(), + voted: (0, Vec::new()), leader_bk: None, replica_bk: None, external: false, @@ -1196,6 +1208,7 @@ impl MultiPaxosReplica { bal: 0, status: Status::Null, reqs: Vec::new(), + voted: (0, Vec::new()), leader_bk: None, replica_bk: None, external: false, @@ -1205,7 +1218,8 @@ impl MultiPaxosReplica { let inst = &mut self.insts[slot]; inst.bal = ballot; inst.status = Status::Accepting; - inst.reqs = reqs; + inst.reqs = reqs.clone(); + inst.voted = (ballot, reqs); // update bal_prepared and bal_max_seen if self.bal_prepared < ballot { self.bal_prepared = ballot; @@ -1625,7 +1639,15 @@ impl GenericEndpoint for MultiPaxosClient { let reply = self.ctrl_stub.recv_reply().await?; match reply { - CtrlReply::QueryInfo { servers } => { + CtrlReply::QueryInfo { + population, + servers, + } => { + // shift to a new server_id if current one not active + assert!(!servers.is_empty()); + while !servers.contains_key(&self.server_id) { + self.server_id = (self.server_id + 1) % population; + } // establish connection to all servers self.servers = servers .into_iter() @@ -1681,7 +1703,10 @@ impl GenericEndpoint for MultiPaxosClient { .unwrap() .send_req(req) } else { - Err(SummersetError("client not set up".into())) + Err(SummersetError(format!( + "server_id {} not in api_stubs", + self.server_id + ))) } } @@ -1712,7 +1737,10 @@ impl GenericEndpoint for MultiPaxosClient { Ok(reply) } else { - Err(SummersetError("client not set up".into())) + Err(SummersetError(format!( + "server_id {} not in api_stubs", + self.server_id + ))) } } diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index 97469f5f..e5c6b0dd 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -612,8 +612,17 @@ impl GenericEndpoint for RepNothingClient { let reply = self.ctrl_stub.recv_reply().await?; match reply { - CtrlReply::QueryInfo { servers } => { - // connect to the one with server ID in config + CtrlReply::QueryInfo { + population, + servers, + } => { + // find a server to connect to, starting from provided server_id + assert!(!servers.is_empty()); + while !servers.contains_key(&self.config.server_id) { + self.config.server_id = + (self.config.server_id + 1) % population; + } + // connect to that server pf_info!(self.id; "connecting to server {} '{}'...", self.config.server_id, servers[&self.config.server_id].0); let api_stub 
= ClientApiStub::new_by_connect( diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index d963a7b8..151470c7 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -908,7 +908,6 @@ impl RSPaxosReplica { } /// Handler of Commit message from leader. - /// TODO: take care of missing/lost Commit messages fn handle_msg_commit( &mut self, peer: ReplicaId, @@ -1043,7 +1042,10 @@ impl RSPaxosReplica { /// Becomes a leader, sends self-initiated Prepare messages to followers /// for all in-progress instances, and starts broadcasting heartbeats. fn become_a_leader(&mut self) -> Result<(), SummersetError> { - assert!(!self.is_leader); + if self.is_leader { + return Ok(()); + } + self.is_leader = true; // this starts broadcasting heartbeats self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; @@ -1744,7 +1746,15 @@ impl GenericEndpoint for RSPaxosClient { let reply = self.ctrl_stub.recv_reply().await?; match reply { - CtrlReply::QueryInfo { servers } => { + CtrlReply::QueryInfo { + population, + servers, + } => { + // shift to a new server_id if current one not active + assert!(!servers.is_empty()); + while !servers.contains_key(&self.server_id) { + self.server_id = (self.server_id + 1) % population; + } // establish connection to all servers self.servers = servers .into_iter() @@ -1800,7 +1810,10 @@ impl GenericEndpoint for RSPaxosClient { .unwrap() .send_req(req) } else { - Err(SummersetError("client not set up".into())) + Err(SummersetError(format!( + "server_id {} not in api_stubs", + self.server_id + ))) } } @@ -1831,7 +1844,10 @@ impl GenericEndpoint for RSPaxosClient { Ok(reply) } else { - Err(SummersetError("client not set up".into())) + Err(SummersetError(format!( + "server_id {} not in api_stubs", + self.server_id + ))) } } diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index c42be3bc..5d8baeec 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -839,8 +839,17 @@ impl GenericEndpoint for SimplePushClient { let reply = self.ctrl_stub.recv_reply().await?; match reply { - CtrlReply::QueryInfo { servers } => { - // connect to the one with server ID in config + CtrlReply::QueryInfo { + population, + servers, + } => { + // find a server to connect to, starting from provided server_id + assert!(!servers.is_empty()); + while !servers.contains_key(&self.config.server_id) { + self.config.server_id = + (self.config.server_id + 1) % population; + } + // connect to that server pf_info!(self.id; "connecting to server {} '{}'...", self.config.server_id, servers[&self.config.server_id].0); let api_stub = ClientApiStub::new_by_connect( diff --git a/src/server/storage.rs b/src/server/storage.rs index 99809e65..d14bd2bf 100644 --- a/src/server/storage.rs +++ b/src/server/storage.rs @@ -203,12 +203,16 @@ where offset: usize, ) -> Result<(Option, usize), SummersetError> { if offset + 8 > file_size { - pf_warn!( - me; - "read header end offset {} out of file bound {}", - offset + 8, - file_size - ); + if offset < file_size { + // suppress warning if offset == file_size to avoid excessive + // log lines during recovery + pf_warn!( + me; + "read header end offset {} out of file bound {}", + offset + 8, + file_size + ); + } return Ok((None, offset)); } diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index e3c27cc0..7da4d5df 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -19,7 +19,7 @@ use tokio::time::{self, 
Duration}; use summerset::{ ReplicaId, GenericEndpoint, CommandResult, CtrlRequest, CtrlReply, - SummersetError, pf_error, logged_err, parsed_config, + SummersetError, pf_debug, pf_error, logged_err, parsed_config, }; lazy_static! { @@ -29,9 +29,10 @@ lazy_static! { ("client_reconnect", true), ("non_leader_reset", true), ("leader_node_reset", true), - ("two_nodes_reset", false), + ("two_nodes_reset", true), ("non_leader_pause", false), ("leader_node_pause", false), + ("node_pause_resume", false), ]; } @@ -109,13 +110,16 @@ impl ClientTester { } /// Issues a Get request and checks its reply value against given one if - /// not `None`. Retries in-place upon getting redirection error. + /// not `None`. Retries in-place upon getting redirection error. Retries + /// at most max_timeouts times upon getting timeouts. async fn checked_get( &mut self, key: &str, expect_value: Option>, + max_timeouts: u8, ) -> Result<(), SummersetError> { - loop { + let mut timeouts = 0; + while timeouts <= max_timeouts { let result = self.driver.get(key).await?; match result { DriverReply::Success { cmd_result, .. } => { @@ -151,25 +155,36 @@ impl ClientTester { } DriverReply::Timeout => { - return logged_err!( + timeouts += 1; + pf_debug!( self.driver.id; "client-side timeout {} ms", self.timeout.as_millis() - ) + ); } } } + + logged_err!( + self.driver.id; + "client-side timeout {} ms {} times", + self.timeout.as_millis(), + max_timeouts + ) } /// Issues a Put request and checks its reply old_value against given one - /// if not `None`. Retries in-place upon getting redirection error. + /// if not `None`. Retries in-place upon getting redirection error. Retries + /// at most max_timeouts times upon getting timeouts. async fn checked_put( &mut self, key: &str, value: &str, expect_old_value: Option>, + max_timeouts: u8, ) -> Result<(), SummersetError> { - loop { + let mut timeouts = 0; + while timeouts <= max_timeouts { let result = self.driver.put(key, value).await?; match result { DriverReply::Success { cmd_result, .. } => { @@ -206,14 +221,22 @@ impl ClientTester { } DriverReply::Timeout => { - return logged_err!( + timeouts += 1; + pf_debug!( self.driver.id; "client-side timeout {} ms", self.timeout.as_millis() - ) + ); } } } + + logged_err!( + self.driver.id; + "client-side timeout {} ms {} times", + self.timeout.as_millis(), + max_timeouts + ) } /// Query the list of servers in the cluster. Returns a map from replica ID @@ -233,7 +256,7 @@ impl ClientTester { // wait for reply from manager let reply = ctrl_stub.recv_reply().await?; match reply { - CtrlReply::QueryInfo { servers } => { + CtrlReply::QueryInfo { servers, .. } => { Ok(servers.into_iter().map(|(id, info)| (id, info.1)).collect()) } _ => logged_err!(self.driver.id; "unexpected control reply type"), @@ -326,6 +349,7 @@ impl ClientTester { "two_nodes_reset" => self.test_two_nodes_reset().await, "non_leader_pause" => self.test_non_leader_pause().await, "leader_node_pause" => self.test_leader_node_pause().await, + "node_pause_resume" => self.test_node_pause_resume().await, _ => { return logged_err!(self.driver.id; "unrecognized test name '{}'", name); @@ -391,37 +415,37 @@ impl ClientTester { impl ClientTester { /// Basic primitive operations. 
async fn test_primitive_ops(&mut self) -> Result<(), SummersetError> { - self.checked_get("Jose", Some(None)).await?; + self.checked_get("Jose", Some(None), 0).await?; let v0 = Self::gen_rand_string(8); - self.checked_put("Jose", &v0, Some(None)).await?; - self.checked_get("Jose", Some(Some(&v0))).await?; + self.checked_put("Jose", &v0, Some(None), 0).await?; + self.checked_get("Jose", Some(Some(&v0)), 0).await?; let v1 = Self::gen_rand_string(16); - self.checked_put("Jose", &v1, Some(Some(&v0))).await?; - self.checked_get("Jose", Some(Some(&v1))).await?; + self.checked_put("Jose", &v1, Some(Some(&v0)), 0).await?; + self.checked_get("Jose", Some(Some(&v1)), 0).await?; Ok(()) } /// Client leaves and reconnects. async fn test_client_reconnect(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); - self.checked_put("Jose", &v, Some(None)).await?; + self.checked_put("Jose", &v, Some(None), 0).await?; self.driver.leave(false).await?; self.driver.connect().await?; - self.checked_get("Jose", Some(Some(&v))).await?; + self.checked_get("Jose", Some(Some(&v)), 0).await?; Ok(()) } /// Single non-leader replica node crashes and restarts. async fn test_non_leader_reset(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); - self.checked_put("Jose", &v, Some(None)).await?; + self.checked_put("Jose", &v, Some(None), 0).await?; for (s, is_leader) in self.query_servers().await? { if !is_leader { self.driver.leave(false).await?; self.reset_servers(HashSet::from([s]), true).await?; time::sleep(Duration::from_millis(500)).await; self.driver.connect().await?; - self.checked_get("Jose", Some(Some(&v))).await?; + self.checked_get("Jose", Some(Some(&v)), 0).await?; break; } } @@ -431,14 +455,14 @@ impl ClientTester { /// Single leader replica node crashes and restarts. async fn test_leader_node_reset(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); - self.checked_put("Jose", &v, Some(None)).await?; + self.checked_put("Jose", &v, Some(None), 0).await?; for (s, is_leader) in self.query_servers().await? { if is_leader { self.driver.leave(false).await?; self.reset_servers(HashSet::from([s]), true).await?; time::sleep(Duration::from_millis(500)).await; self.driver.connect().await?; - self.checked_get("Jose", Some(Some(&v))).await?; + self.checked_get("Jose", Some(Some(&v)), 0).await?; break; } } @@ -448,7 +472,7 @@ impl ClientTester { /// Two replica nodes (leader + non-leader) crash and restart. async fn test_two_nodes_reset(&mut self) -> Result<(), SummersetError> { let v = Self::gen_rand_string(8); - self.checked_put("Jose", &v, Some(None)).await?; + self.checked_put("Jose", &v, Some(None), 0).await?; let mut resets = HashSet::new(); let (mut l, mut nl) = (false, false); for (s, is_leader) in self.query_servers().await? { @@ -469,7 +493,7 @@ impl ClientTester { self.reset_servers(resets, true).await?; time::sleep(Duration::from_millis(500)).await; self.driver.connect().await?; - self.checked_get("Jose", Some(Some(&v))).await?; + self.checked_get("Jose", Some(Some(&v)), 0).await?; } Ok(()) } @@ -477,7 +501,7 @@ impl ClientTester { /// Single non-leader replica node paused. async fn test_non_leader_pause(&mut self) -> Result<(), SummersetError> { let v0 = Self::gen_rand_string(8); - self.checked_put("Jose", &v0, Some(None)).await?; + self.checked_put("Jose", &v0, Some(None), 0).await?; time::sleep(Duration::from_millis(300)).await; for (s, is_leader) in self.query_servers().await? 
{ if !is_leader { @@ -485,9 +509,9 @@ impl ClientTester { self.pause_servers(HashSet::from([s])).await?; time::sleep(Duration::from_secs(1)).await; self.driver.connect().await?; - self.checked_get("Jose", Some(Some(&v0))).await?; + self.checked_get("Jose", Some(Some(&v0)), 0).await?; let v1 = Self::gen_rand_string(8); - self.checked_put("Jose", &v1, Some(Some(&v0))).await?; + self.checked_put("Jose", &v1, Some(Some(&v0)), 0).await?; break; } } @@ -497,7 +521,7 @@ impl ClientTester { /// Single leader replica node paused. async fn test_leader_node_pause(&mut self) -> Result<(), SummersetError> { let v0 = Self::gen_rand_string(8); - self.checked_put("Jose", &v0, Some(None)).await?; + self.checked_put("Jose", &v0, Some(None), 0).await?; time::sleep(Duration::from_millis(300)).await; for (s, is_leader) in self.query_servers().await? { if is_leader { @@ -505,9 +529,46 @@ impl ClientTester { self.pause_servers(HashSet::from([s])).await?; time::sleep(Duration::from_secs(1)).await; self.driver.connect().await?; - self.checked_get("Jose", Some(Some(&v0))).await?; + self.checked_get("Jose", Some(Some(&v0)), 0).await?; let v1 = Self::gen_rand_string(8); - self.checked_put("Jose", &v1, Some(Some(&v0))).await?; + self.checked_put("Jose", &v1, Some(Some(&v0)), 0).await?; + break; + } + } + Ok(()) + } + + /// Leader replica node paused and then resumed, twice. + async fn test_node_pause_resume(&mut self) -> Result<(), SummersetError> { + let v0 = Self::gen_rand_string(8); + self.checked_put("Jose", &v0, Some(None), 0).await?; + time::sleep(Duration::from_millis(300)).await; + for (s, is_leader) in self.query_servers().await? { + if is_leader { + self.driver.leave(false).await?; + self.pause_servers(HashSet::from([s])).await?; + time::sleep(Duration::from_secs(1)).await; + self.driver.connect().await?; + let v1 = Self::gen_rand_string(8); + self.checked_put("Jose", &v1, Some(Some(&v0)), 0).await?; + self.driver.leave(false).await?; + self.resume_servers(HashSet::from([s])).await?; + time::sleep(Duration::from_secs(1)).await; + self.driver.connect().await?; + let v2 = Self::gen_rand_string(8); + self.checked_put("Jose", &v2, Some(Some(&v1)), 1).await?; + self.driver.leave(false).await?; + self.pause_servers(HashSet::from([s])).await?; + time::sleep(Duration::from_secs(1)).await; + self.driver.connect().await?; + let v3 = Self::gen_rand_string(8); + self.checked_put("Jose", &v3, Some(Some(&v2)), 0).await?; + self.driver.leave(false).await?; + self.resume_servers(HashSet::from([s])).await?; + time::sleep(Duration::from_secs(1)).await; + self.driver.connect().await?; + let v4 = Self::gen_rand_string(8); + self.checked_put("Jose", &v4, Some(Some(&v3)), 1).await?; break; } } From 694b3b1dcfa50286a581d69e61aae9b069797564 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 17 Sep 2023 21:46:17 +0000 Subject: [PATCH 49/89] fix prepare reply voted bug for other Paxos variants --- src/protocols/crossword.rs | 165 ++++++++++++++++-------------------- src/protocols/multipaxos.rs | 84 ++++++------------ src/protocols/rs_paxos.rs | 137 +++++++++++++----------------- 3 files changed, 154 insertions(+), 232 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 529b35c8..7bb12ff8 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -140,6 +140,9 @@ struct Instance { /// Shards of a batch of client requests. reqs_cw: RSCodeword, + /// Highest ballot and associated value I have accepted. + voted: (Ballot, RSCodeword), + /// Leader-side bookkeeping info. 
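The `voted` field added above captures the acceptor obligation this patch fixes: a PrepareReply must report the highest ballot at which the replica actually accepted a value, together with that value, rather than whatever payload it currently holds, so that a new leader re-proposes any possibly chosen value. A minimal sketch of that rule, using simplified standalone types rather than this crate's own:

// Minimal sketch (simplified types, not this crate's API): an acceptor's
// prepare handler reports its highest accepted (ballot, value) pair.
type Ballot = u64;

struct Acceptor<V> {
    promised: Ballot,            // highest ballot promised so far
    voted: Option<(Ballot, V)>,  // highest ballot actually accepted, with its value
}

impl<V: Clone> Acceptor<V> {
    /// Handle Prepare(b): promise b if it is new enough, and return the
    /// highest accepted pair so the new leader can re-propose it.
    fn handle_prepare(&mut self, bal: Ballot) -> Option<Option<(Ballot, V)>> {
        if bal < self.promised {
            return None; // reject stale prepare
        }
        self.promised = bal;
        Some(self.voted.clone())
    }

    /// Handle Accept(b, v): record it as the new highest vote if not stale.
    fn handle_accept(&mut self, bal: Ballot, val: V) -> bool {
        if bal < self.promised {
            return false;
        }
        self.promised = bal;
        self.voted = Some((bal, val));
        true
    }
}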
leader_bk: Option, @@ -270,6 +273,28 @@ pub struct CrosswordReplica { } impl CrosswordReplica { + /// Create an empty null instance. + fn null_instance(&self) -> Result { + Ok(Instance { + bal: 0, + status: Status::Null, + reqs_cw: RSCodeword::::from_null( + self.quorum_cnt, + self.population - self.quorum_cnt, + )?, + voted: ( + 0, + RSCodeword::::from_null( + self.quorum_cnt, + self.population - self.quorum_cnt, + )?, + ), + leader_bk: None, + replica_bk: None, + external: false, + }) + } + /// Compose a unique ballot number from base. fn make_unique_ballot(&self, base: u64) -> Ballot { ((base << 8) | ((self.id + 1) as u64)) as Ballot @@ -428,18 +453,14 @@ impl CrosswordReplica { accept_acks: HashMap::new(), }); } else { - let new_inst = Instance { - bal: 0, - status: Status::Null, - reqs_cw, - leader_bk: Some(LeaderBookkeeping { - prepare_acks: Bitmap::new(self.population, false), - prepare_max_bal: 0, - accept_acks: HashMap::new(), - }), - replica_bk: None, - external: true, - }; + let mut new_inst = self.null_instance()?; + new_inst.reqs_cw = reqs_cw; + new_inst.leader_bk = Some(LeaderBookkeeping { + prepare_acks: Bitmap::new(self.population, false), + prepare_max_bal: 0, + accept_acks: HashMap::new(), + }); + new_inst.external = true; self.insts.push(new_inst); } @@ -493,6 +514,18 @@ impl CrosswordReplica { slot, inst.bal); // record update to largest accepted ballot and corresponding data + let subset_copy = inst.reqs_cw.subset_copy( + Bitmap::from( + self.population, + Self::shards_for_replica( + self.id, + self.population, + self.config.shards_per_replica, + ), + ), + false, + )?; + inst.voted = (inst.bal, subset_copy.clone()); self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { @@ -500,17 +533,7 @@ impl CrosswordReplica { slot, ballot: inst.bal, // persist only some shards on myself - reqs_cw: inst.reqs_cw.subset_copy( - Bitmap::from( - self.population, - Self::shards_for_replica( - self.id, - self.population, - self.config.shards_per_replica, - ), - ), - false, - )?, + reqs_cw: subset_copy, }, sync: self.config.logger_sync, }, @@ -558,8 +581,8 @@ impl CrosswordReplica { pf_trace!(self.id; "finished PrepareBal logging for slot {} bal {}", slot, self.insts[slot].bal); let inst = &self.insts[slot]; - let voted = if inst.status >= Status::Accepting { - Some((inst.bal, inst.reqs_cw.clone())) + let voted = if inst.voted.0 > 0 { + Some(inst.voted.clone()) } else { None }; @@ -716,17 +739,7 @@ impl CrosswordReplica { if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, - )?, - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()?); } let inst = &mut self.insts[slot]; assert!(inst.bal <= ballot); @@ -819,23 +832,25 @@ impl CrosswordReplica { } // record update to largest accepted ballot and corresponding data + let subset_copy = inst.reqs_cw.subset_copy( + Bitmap::from( + self.population, + Self::shards_for_replica( + self.id, + self.population, + self.config.shards_per_replica, + ), + ), + false, + )?; + inst.voted = (ballot, subset_copy.clone()); self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { entry: LogEntry::AcceptData { slot, ballot, - reqs_cw: inst.reqs_cw.subset_copy( - Bitmap::from( - 
self.population, - Self::shards_for_replica( - self.id, - self.population, - self.config.shards_per_replica, - ), - ), - false, - )?, + reqs_cw: subset_copy, }, sync: self.config.logger_sync, }, @@ -890,17 +905,7 @@ impl CrosswordReplica { if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, - )?, - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()?); } let inst = &mut self.insts[slot]; assert!(inst.bal <= ballot); @@ -914,6 +919,7 @@ impl CrosswordReplica { self.bal_max_seen = ballot; // record update to largest prepare ballot + inst.voted = (ballot, inst.reqs_cw.clone()); self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { @@ -1017,17 +1023,7 @@ impl CrosswordReplica { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, - )?, - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()?); } let inst = &mut self.insts[slot]; @@ -1307,12 +1303,12 @@ impl CrosswordReplica { paused: &mut bool, ) -> Result<(), SummersetError> { pf_warn!(self.id; "server got resume req"); - *paused = false; - self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; // reset leader heartbeat timer self.kickoff_hb_hear_timer()?; + *paused = false; + self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; Ok(()) } @@ -1353,17 +1349,7 @@ impl CrosswordReplica { LogEntry::PrepareBal { slot, ballot } => { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, - )?, - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()?); } // update instance state let inst = &mut self.insts[slot]; @@ -1386,23 +1372,14 @@ impl CrosswordReplica { } => { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, - )?, - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()?); } // update instance state let inst = &mut self.insts[slot]; inst.bal = ballot; inst.status = Status::Accepting; - inst.reqs_cw = reqs_cw; + inst.reqs_cw = reqs_cw.clone(); + inst.voted = (ballot, reqs_cw); // update bal_prepared and bal_max_seen if self.bal_prepared < ballot { self.bal_prepared = ballot; diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 974b95d4..e5024071 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -262,6 +262,19 @@ pub struct MultiPaxosReplica { } impl MultiPaxosReplica { + /// Create an empty null instance. 
+ fn null_instance(&self) -> Instance { + Instance { + bal: 0, + status: Status::Null, + reqs: Vec::new(), + voted: (0, Vec::new()), + leader_bk: None, + replica_bk: None, + external: false, + } + } + /// Compose a unique ballot number from base. fn make_unique_ballot(&self, base: u64) -> Ballot { ((base << 8) | ((self.id + 1) as u64)) as Ballot @@ -358,19 +371,14 @@ impl MultiPaxosReplica { } } if slot == self.insts.len() { - let new_inst = Instance { - bal: 0, - status: Status::Null, - reqs: req_batch.clone(), - voted: (0, Vec::new()), - leader_bk: Some(LeaderBookkeeping { - prepare_acks: Bitmap::new(self.population, false), - prepare_max_bal: 0, - accept_acks: Bitmap::new(self.population, false), - }), - replica_bk: None, - external: true, - }; + let mut new_inst = self.null_instance(); + new_inst.reqs = req_batch.clone(); + new_inst.leader_bk = Some(LeaderBookkeeping { + prepare_acks: Bitmap::new(self.population, false), + prepare_max_bal: 0, + accept_acks: Bitmap::new(self.population, false), + }); + new_inst.external = true; self.insts.push(new_inst); } @@ -610,15 +618,7 @@ impl MultiPaxosReplica { if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs: Vec::new(), - voted: (0, Vec::new()), - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()); } let inst = &mut self.insts[slot]; assert!(inst.bal <= ballot); @@ -741,15 +741,7 @@ impl MultiPaxosReplica { if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs: Vec::new(), - voted: (0, Vec::new()), - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()); } let inst = &mut self.insts[slot]; assert!(inst.bal <= ballot); @@ -846,15 +838,7 @@ impl MultiPaxosReplica { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs: Vec::new(), - voted: (0, Vec::new()), - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()); } let inst = &mut self.insts[slot]; @@ -1177,15 +1161,7 @@ impl MultiPaxosReplica { LogEntry::PrepareBal { slot, ballot } => { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs: Vec::new(), - voted: (0, Vec::new()), - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()); } // update instance state let inst = &mut self.insts[slot]; @@ -1204,15 +1180,7 @@ impl MultiPaxosReplica { LogEntry::AcceptData { slot, ballot, reqs } => { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs: Vec::new(), - voted: (0, Vec::new()), - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()); } // update instance state let inst = &mut self.insts[slot]; diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 151470c7..f0320a9e 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -134,6 +134,9 @@ struct Instance { /// Shards of a batch of client 
requests. reqs_cw: RSCodeword, + /// Highest ballot and associated value I have accepted. + voted: (Ballot, RSCodeword), + /// Leader-side bookkeeping info. leader_bk: Option, @@ -264,6 +267,28 @@ pub struct RSPaxosReplica { } impl RSPaxosReplica { + /// Create an empty null instance. + fn null_instance(&self) -> Result { + Ok(Instance { + bal: 0, + status: Status::Null, + reqs_cw: RSCodeword::::from_null( + self.quorum_cnt, + self.population - self.quorum_cnt, + )?, + voted: ( + 0, + RSCodeword::::from_null( + self.quorum_cnt, + self.population - self.quorum_cnt, + )?, + ), + leader_bk: None, + replica_bk: None, + external: false, + }) + } + /// Compose a unique ballot number from base. fn make_unique_ballot(&self, base: u64) -> Ballot { ((base << 8) | ((self.id + 1) as u64)) as Ballot @@ -370,18 +395,14 @@ impl RSPaxosReplica { accept_acks: Bitmap::new(self.population, false), }); } else { - let new_inst = Instance { - bal: 0, - status: Status::Null, - reqs_cw, - leader_bk: Some(LeaderBookkeeping { - prepare_acks: Bitmap::new(self.population, false), - prepare_max_bal: 0, - accept_acks: Bitmap::new(self.population, false), - }), - replica_bk: None, - external: true, - }; + let mut new_inst = self.null_instance()?; + new_inst.reqs_cw = reqs_cw; + new_inst.leader_bk = Some(LeaderBookkeeping { + prepare_acks: Bitmap::new(self.population, false), + prepare_max_bal: 0, + accept_acks: Bitmap::new(self.population, false), + }); + new_inst.external = true; self.insts.push(new_inst); } @@ -435,6 +456,11 @@ impl RSPaxosReplica { slot, inst.bal); // record update to largest accepted ballot and corresponding data + let subset_copy = inst.reqs_cw.subset_copy( + Bitmap::from(self.population, vec![self.id]), + false, + )?; + inst.voted = (inst.bal, subset_copy.clone()); self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { @@ -442,10 +468,7 @@ impl RSPaxosReplica { slot, ballot: inst.bal, // persist only one shard on myself - reqs_cw: inst.reqs_cw.subset_copy( - Bitmap::from(self.population, vec![self.id]), - false, - )?, + reqs_cw: subset_copy, }, sync: self.config.logger_sync, }, @@ -485,8 +508,8 @@ impl RSPaxosReplica { pf_trace!(self.id; "finished PrepareBal logging for slot {} bal {}", slot, self.insts[slot].bal); let inst = &self.insts[slot]; - let voted = if inst.status >= Status::Accepting { - Some((inst.bal, inst.reqs_cw.clone())) + let voted = if inst.voted.0 > 0 { + Some(inst.voted.clone()) } else { None }; @@ -643,17 +666,7 @@ impl RSPaxosReplica { if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, - )?, - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()?); } let inst = &mut self.insts[slot]; assert!(inst.bal <= ballot); @@ -746,16 +759,18 @@ impl RSPaxosReplica { } // record update to largest accepted ballot and corresponding data + let subset_copy = inst.reqs_cw.subset_copy( + Bitmap::from(self.population, vec![self.id]), + false, + )?; + inst.voted = (ballot, subset_copy.clone()); self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { entry: LogEntry::AcceptData { slot, ballot, - reqs_cw: inst.reqs_cw.subset_copy( - Bitmap::from(self.population, vec![self.id]), - false, - )?, + reqs_cw: subset_copy, }, sync: 
self.config.logger_sync, }, @@ -803,17 +818,7 @@ impl RSPaxosReplica { if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, - )?, - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()?); } let inst = &mut self.insts[slot]; assert!(inst.bal <= ballot); @@ -827,6 +832,7 @@ impl RSPaxosReplica { self.bal_max_seen = ballot; // record update to largest prepare ballot + inst.voted = (ballot, inst.reqs_cw.clone()); self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { @@ -917,17 +923,7 @@ impl RSPaxosReplica { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, - )?, - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()?); } let inst = &mut self.insts[slot]; @@ -1207,12 +1203,12 @@ impl RSPaxosReplica { paused: &mut bool, ) -> Result<(), SummersetError> { pf_warn!(self.id; "server got resume req"); - *paused = false; - self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; // reset leader heartbeat timer self.kickoff_hb_hear_timer()?; + *paused = false; + self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; Ok(()) } @@ -1253,17 +1249,7 @@ impl RSPaxosReplica { LogEntry::PrepareBal { slot, ballot } => { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, - )?, - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()?); } // update instance state let inst = &mut self.insts[slot]; @@ -1286,23 +1272,14 @@ impl RSPaxosReplica { } => { // locate instance in memory, filling in null instances if needed while self.insts.len() <= slot { - self.insts.push(Instance { - bal: 0, - status: Status::Null, - reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, - )?, - leader_bk: None, - replica_bk: None, - external: false, - }); + self.insts.push(self.null_instance()?); } // update instance state let inst = &mut self.insts[slot]; inst.bal = ballot; inst.status = Status::Accepting; - inst.reqs_cw = reqs_cw; + inst.reqs_cw = reqs_cw.clone(); + inst.voted = (ballot, reqs_cw); // update bal_prepared and bal_max_seen if self.bal_prepared < ballot { self.bal_prepared = ballot; From 33920992bec932aaee807f00040db271cbca8756 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 17 Sep 2023 23:54:14 +0000 Subject: [PATCH 50/89] add recovery read msgs & fix sharding bugs --- src/protocols/crossword.rs | 135 ++++++++++++++++++++++++- src/protocols/rs_paxos.rs | 135 ++++++++++++++++++++++++- summerset_client/src/clients/tester.rs | 2 +- 3 files changed, 267 insertions(+), 5 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 7bb12ff8..c9c6a8da 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -198,6 +198,16 @@ enum PeerMsg { /// Commit notification from leader to replicas. 
Commit { slot: usize }, + /// Recovery read from new leader to replicas. + Recover { slot: usize }, + + /// Recovery read reply from replica to leader. + RecoverReply { + slot: usize, + ballot: Ballot, + reqs_cw: RSCodeword, + }, + /// Leader activity heartbeat. Heartbeat { ballot: Ballot }, } @@ -816,8 +826,13 @@ impl CrosswordReplica { // instance using the request batch value constructed using shards // with the highest ballot number in quorum if leader_bk.prepare_acks.count() >= self.quorum_cnt - && inst.reqs_cw.avail_data_shards() >= self.quorum_cnt + && inst.reqs_cw.avail_shards() >= self.quorum_cnt { + if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + // have enough shards but need reconstruction + inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; + } + inst.status = Status::Accepting; pf_debug!(self.id; "enter Accept phase for slot {} bal {}", slot, inst.bal); @@ -1051,6 +1066,105 @@ impl CrosswordReplica { Ok(()) } + /// Handler of Recover message from leader. + fn handle_msg_recover( + &mut self, + peer: ReplicaId, + slot: usize, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received Recover <- {} for slot {}", peer, slot); + + // locate instance in memory, filling in null instances if needed + while self.insts.len() <= slot { + self.insts.push(self.null_instance()?); + } + let inst = &mut self.insts[slot]; + + // ignore spurious duplications; also ignore if I have nothing to send back + if inst.status < Status::Accepting || inst.reqs_cw.avail_shards() == 0 { + return Ok(()); + } + + // send back my ballot for this slot and the available shards + self.transport_hub.send_msg( + PeerMsg::RecoverReply { + slot, + ballot: inst.bal, + reqs_cw: inst.reqs_cw.clone(), + }, + peer, + )?; + pf_trace!(self.id; "sent RecoverReply message for slot {} bal {}", slot, inst.bal); + + Ok(()) + } + + /// Handler of Recover reply from replica. + fn handle_msg_recover_reply( + &mut self, + peer: ReplicaId, + slot: usize, + ballot: Ballot, + reqs_cw: RSCodeword, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received RecoverReply <- {} for slot {} bal {} shards {:?}", + peer, slot, ballot, reqs_cw.avail_shards_map()); + assert!(slot < self.insts.len()); + assert!(self.insts[slot].status >= Status::Committed); + let num_insts = self.insts.len(); + let inst = &mut self.insts[slot]; + + // if reply not outdated and ballot is up-to-date + if inst.status < Status::Executed && ballot >= inst.bal { + // absorb the shards from this replica + inst.reqs_cw.absorb_other(reqs_cw)?; + + // if enough shards have been gathered, can push execution forward + if slot == self.commit_bar { + while self.commit_bar < num_insts { + let inst = &mut self.insts[self.commit_bar]; + if inst.status < Status::Committed + || inst.reqs_cw.avail_shards() < self.quorum_cnt + { + break; + } + + if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + // have enough shards but need reconstruction + inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; + } + let reqs = inst.reqs_cw.get_data()?; + + // submit commands in committed instance to the state machine + // for execution + if reqs.is_empty() { + inst.status = Status::Executed; + } else { + for (cmd_idx, (_, req)) in reqs.iter().enumerate() { + if let ApiRequest::Req { cmd, .. 
} = req { + self.state_machine.submit_cmd( + Self::make_command_id( + self.commit_bar, + cmd_idx, + ), + cmd.clone(), + )?; + } else { + continue; // ignore other types of requests + } + } + pf_trace!(self.id; "submitted {} exec commands for slot {}", + reqs.len(), self.commit_bar); + } + + self.commit_bar += 1; + } + } + } + + Ok(()) + } + /// Synthesized handler of receiving message from peer. fn handle_msg_recv( &mut self, @@ -1075,6 +1189,12 @@ impl CrosswordReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), + PeerMsg::Recover { slot } => self.handle_msg_recover(peer, slot), + PeerMsg::RecoverReply { + slot, + ballot, + reqs_cw, + } => self.handle_msg_recover_reply(peer, slot, ballot, reqs_cw), PeerMsg::Heartbeat { ballot } => self.heard_heartbeat(peer, ballot), } } @@ -1155,8 +1275,8 @@ impl CrosswordReplica { self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen); self.bal_max_seen = self.bal_prep_sent; - // redo Prepare phase for all in-progress instances for (slot, inst) in self.insts.iter_mut().enumerate() { + // redo Prepare phase for all in-progress instances if inst.status < Status::Committed { inst.bal = self.bal_prep_sent; inst.status = Status::Preparing; @@ -1188,6 +1308,17 @@ impl CrosswordReplica { pf_trace!(self.id; "broadcast Prepare messages for slot {} bal {}", slot, inst.bal); } + + // do recovery reads for all committed instances that do not + // hold enough available shards for reconstruction + if inst.status == Status::Committed + && inst.reqs_cw.avail_shards() < self.quorum_cnt + { + self.transport_hub + .bcast_msg(PeerMsg::Recover { slot }, None)?; + pf_trace!(self.id; "broadcast Recover messages for slot {} bal {} shards {:?}", + slot, inst.bal, inst.reqs_cw.avail_shards_map()); + } } Ok(()) diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index f0320a9e..9a79da2e 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -192,6 +192,16 @@ enum PeerMsg { /// Commit notification from leader to replicas. Commit { slot: usize }, + /// Recovery read from new leader to replicas. + Recover { slot: usize }, + + /// Recovery read reply from replica to leader. + RecoverReply { + slot: usize, + ballot: Ballot, + reqs_cw: RSCodeword, + }, + /// Leader activity heartbeat. Heartbeat { ballot: Ballot }, } @@ -743,8 +753,13 @@ impl RSPaxosReplica { // instance using the request batch value constructed using shards // with the highest ballot number in quorum if leader_bk.prepare_acks.count() >= self.quorum_cnt - && inst.reqs_cw.avail_data_shards() >= self.quorum_cnt + && inst.reqs_cw.avail_shards() >= self.quorum_cnt { + if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + // have enough shards but need reconstruction + inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; + } + inst.status = Status::Accepting; pf_debug!(self.id; "enter Accept phase for slot {} bal {}", slot, inst.bal); @@ -951,6 +966,105 @@ impl RSPaxosReplica { Ok(()) } + /// Handler of Recover message from leader. 
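The Recover / RecoverReply exchange lets a new leader regather shards for slots it knows are committed but cannot yet reconstruct: replies are absorbed until at least `quorum_cnt` distinct shards are available, after which the data shards are rebuilt and execution can proceed. A minimal sketch of that threshold check, assuming plain `Option<Vec<u8>>` shards rather than the crate's `RSCodeword` wrapper (payload padding handling omitted):

// Minimal sketch (assumed simplified setup): once enough shards of a slot
// have been gathered from RecoverReply messages, rebuild the original data.
use reed_solomon_erasure::galois_8::ReedSolomon;

fn try_reconstruct(
    shards: &mut [Option<Vec<u8>>], // one entry per shard index; None = missing
    data_shards: usize,
    parity_shards: usize,
) -> Result<Option<Vec<u8>>, reed_solomon_erasure::Error> {
    let avail = shards.iter().filter(|s| s.is_some()).count();
    if avail < data_shards {
        return Ok(None); // not enough shards yet; keep waiting for replies
    }
    let rs = ReedSolomon::new(data_shards, parity_shards)?;
    rs.reconstruct_data(shards)?; // fill in missing data shards in place
    // concatenate the data shards back into the original byte payload
    let mut data = Vec::new();
    for shard in shards.iter().take(data_shards) {
        data.extend_from_slice(shard.as_ref().unwrap());
    }
    Ok(Some(data))
}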
+ fn handle_msg_recover( + &mut self, + peer: ReplicaId, + slot: usize, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received Recover <- {} for slot {}", peer, slot); + + // locate instance in memory, filling in null instances if needed + while self.insts.len() <= slot { + self.insts.push(self.null_instance()?); + } + let inst = &mut self.insts[slot]; + + // ignore spurious duplications; also ignore if I have nothing to send back + if inst.status < Status::Accepting || inst.reqs_cw.avail_shards() == 0 { + return Ok(()); + } + + // send back my ballot for this slot and the available shards + self.transport_hub.send_msg( + PeerMsg::RecoverReply { + slot, + ballot: inst.bal, + reqs_cw: inst.reqs_cw.clone(), + }, + peer, + )?; + pf_trace!(self.id; "sent RecoverReply message for slot {} bal {}", slot, inst.bal); + + Ok(()) + } + + /// Handler of Recover reply from replica. + fn handle_msg_recover_reply( + &mut self, + peer: ReplicaId, + slot: usize, + ballot: Ballot, + reqs_cw: RSCodeword, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received RecoverReply <- {} for slot {} bal {} shards {:?}", + peer, slot, ballot, reqs_cw.avail_shards_map()); + assert!(slot < self.insts.len()); + assert!(self.insts[slot].status >= Status::Committed); + let num_insts = self.insts.len(); + let inst = &mut self.insts[slot]; + + // if reply not outdated and ballot is up-to-date + if inst.status < Status::Executed && ballot >= inst.bal { + // absorb the shards from this replica + inst.reqs_cw.absorb_other(reqs_cw)?; + + // if enough shards have been gathered, can push execution forward + if slot == self.commit_bar { + while self.commit_bar < num_insts { + let inst = &mut self.insts[self.commit_bar]; + if inst.status < Status::Committed + || inst.reqs_cw.avail_shards() < self.quorum_cnt + { + break; + } + + if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + // have enough shards but need reconstruction + inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; + } + let reqs = inst.reqs_cw.get_data()?; + + // submit commands in committed instance to the state machine + // for execution + if reqs.is_empty() { + inst.status = Status::Executed; + } else { + for (cmd_idx, (_, req)) in reqs.iter().enumerate() { + if let ApiRequest::Req { cmd, .. } = req { + self.state_machine.submit_cmd( + Self::make_command_id( + self.commit_bar, + cmd_idx, + ), + cmd.clone(), + )?; + } else { + continue; // ignore other types of requests + } + } + pf_trace!(self.id; "submitted {} exec commands for slot {}", + reqs.len(), self.commit_bar); + } + + self.commit_bar += 1; + } + } + } + + Ok(()) + } + /// Synthesized handler of receiving message from peer. 
fn handle_msg_recv( &mut self, @@ -975,6 +1089,12 @@ impl RSPaxosReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), + PeerMsg::Recover { slot } => self.handle_msg_recover(peer, slot), + PeerMsg::RecoverReply { + slot, + ballot, + reqs_cw, + } => self.handle_msg_recover_reply(peer, slot, ballot, reqs_cw), PeerMsg::Heartbeat { ballot } => self.heard_heartbeat(peer, ballot), } } @@ -1055,8 +1175,8 @@ impl RSPaxosReplica { self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen); self.bal_max_seen = self.bal_prep_sent; - // redo Prepare phase for all in-progress instances for (slot, inst) in self.insts.iter_mut().enumerate() { + // redo Prepare phase for all in-progress instances if inst.status < Status::Committed { inst.bal = self.bal_prep_sent; inst.status = Status::Preparing; @@ -1088,6 +1208,17 @@ impl RSPaxosReplica { pf_trace!(self.id; "broadcast Prepare messages for slot {} bal {}", slot, inst.bal); } + + // do recovery reads for all committed instances that do not + // hold enough available shards for reconstruction + if inst.status == Status::Committed + && inst.reqs_cw.avail_shards() < self.quorum_cnt + { + self.transport_hub + .bcast_msg(PeerMsg::Recover { slot }, None)?; + pf_trace!(self.id; "broadcast Recover messages for slot {} bal {} shards {:?}", + slot, inst.bal, inst.reqs_cw.avail_shards_map()); + } } Ok(()) diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index 7da4d5df..240a18ff 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -235,7 +235,7 @@ impl ClientTester { self.driver.id; "client-side timeout {} ms {} times", self.timeout.as_millis(), - max_timeouts + max_timeouts + 1 ) } From e416476563e4f34698f3964336b298b7b9421d08 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Mon, 18 Sep 2023 00:18:44 +0000 Subject: [PATCH 51/89] add scripted tests to github workflow beside unit tests --- .github/workflows/tests_proc.yml | 20 ++++ .../workflows/{tests.yml => tests_unit.yml} | 2 +- README.md | 5 +- scripts/workflow_test.py | 109 ++++++++++++++++++ 4 files changed, 133 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/tests_proc.yml rename .github/workflows/{tests.yml => tests_unit.yml} (90%) create mode 100644 scripts/workflow_test.py diff --git a/.github/workflows/tests_proc.yml b/.github/workflows/tests_proc.yml new file mode 100644 index 00000000..dc8c63d9 --- /dev/null +++ b/.github/workflows/tests_proc.yml @@ -0,0 +1,20 @@ +name: Tests + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +env: + CARGO_TERM_COLOR: always + +jobs: + tests: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Run proc tests + run: python3 scripts/workflow_test.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests_unit.yml similarity index 90% rename from .github/workflows/tests.yml rename to .github/workflows/tests_unit.yml index 5c4a843d..91d6ca77 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests_unit.yml @@ -16,5 +16,5 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Run tests + - name: Run unit tests run: cargo test --workspace --verbose diff --git a/README.md b/README.md index 0d008572..a7250245 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,8 @@ This is a private mirror of [Summerset](https://github.com/josehu07/summerset). 
[![Format check](https://github.com/josehu07/summerset-private/actions/workflows/format.yml/badge.svg)](https://github.com/josehu07/summerset-private/actions?query=josehu07%3Aformat) [![Build status](https://github.com/josehu07/summerset-private/actions/workflows/build.yml/badge.svg)](https://github.com/josehu07/summerset-private/actions?query=josehu07%3Abuild) -[![Tests status](https://github.com/josehu07/summerset-private/actions/workflows/tests.yml/badge.svg)](https://github.com/josehu07/summerset-private/actions?query=josehu07%3Atests) +[![Unit tests status](https://github.com/josehu07/summerset-private/actions/workflows/tests_unit.yml/badge.svg)](https://github.com/josehu07/summerset-private/actions?query=josehu07%3Atests_unit) +[![Proc tests status](https://github.com/josehu07/summerset-private/actions/workflows/tests_proc.yml/badge.svg)](https://github.com/josehu07/summerset-private/actions?query=josehu07%3Atests_proc) [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) To create a branch to track public repo `main`, pull new things from it, and merge into the private `main`: @@ -155,7 +156,7 @@ Complete cluster management and benchmarking scripts are available in another re - [ ] membership discovery & view changes - [ ] implementation of Raft - [ ] implementation of Crossword prototype - - [ ] fault recovery reads + - [x] fault recovery reads - [ ] follower gossiping - [x] client-side utilities - [x] REPL-style client diff --git a/scripts/workflow_test.py b/scripts/workflow_test.py new file mode 100644 index 00000000..33484aca --- /dev/null +++ b/scripts/workflow_test.py @@ -0,0 +1,109 @@ +import sys +import os +import subprocess + + +def do_cargo_build(): + print("Building everything...") + cmd = ["cargo", "build", "--workspace"] + proc = subprocess.Popen(cmd) + proc.wait() + + +def run_process(cmd): + # print("Run:", " ".join(cmd)) + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + return proc + + +def kill_all_matching(name, force=False): + # print("Kill all:", name) + assert name.count(" ") == 0 + try: + pgrep_cmd = f"sudo pgrep -f {name}" + pids = subprocess.check_output(pgrep_cmd, shell=True).decode() + pids = pids.strip().split("\n") + for pid in pids: + pid = pid.strip() + if len(pid) > 0: + kill_cmd = f"sudo kill -9" if force else "sudo kill" + kill_cmd += f" {int(pid)} > /dev/null 2>&1" + os.system(kill_cmd) + except subprocess.CalledProcessError: + pass + + +def launch_cluster(protocol, num_replicas, config): + cmd = [ + "python3", + "./scripts/local_cluster.py", + "-p", + protocol, + "-n", + str(num_replicas), + ] + if config is not None and len(config) > 0: + cmd += ["--config", config] + return run_process(cmd) + + +def wait_cluster_setup(proc, num_replicas): + accepting_clients = [False for _ in range(num_replicas)] + + for line in iter(proc.stderr.readline, b""): + l = line.decode() + print(l, end="", file=sys.stderr) + if "manager" not in l and "accepting clients" in l: + replica = int(l[l.find("(") + 1 : l.find(")")]) + assert not accepting_clients[replica] + accepting_clients[replica] = True + + if accepting_clients.count(True) == num_replicas: + break + + +def run_tester_client(protocol, test_name): + cmd = [ + "python3", + "./scripts/local_client.py", + "-p", + protocol, + "tester", + "-t", + test_name, + ] + return run_process(cmd) + + +if __name__ == "__main__": + do_cargo_build() + + kill_all_matching("local_client.py", force=True) + kill_all_matching("local_cluster.py", 
force=True) + kill_all_matching("summerset_client", force=True) + kill_all_matching("summerset_server", force=True) + kill_all_matching("summerset_manager", force=True) + + PROTOCOL = "MultiPaxos" + NUM_REPLICAS = 3 + TEST_NAME = "primitive_ops" + TIMEOUT = 300 + + proc_cluster = launch_cluster(PROTOCOL, NUM_REPLICAS, config=None) + wait_cluster_setup(proc_cluster, NUM_REPLICAS) + + proc_client = run_tester_client(PROTOCOL, TEST_NAME) + + try: + client_rc = proc_client.wait(timeout=TIMEOUT) + except subprocess.TimeoutExpired: + print(f"Client tester did not finish in {TIMEOUT} secs") + exit(1) + + proc_cluster.terminate() + + if client_rc != 0: + print(f"Client tester exitted with {client_rc}") + exit(client_rc) + else: + exit(0) From 70e6093561ab76c75ed60d5a280ad5da180ef203 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Mon, 18 Sep 2023 00:21:55 +0000 Subject: [PATCH 52/89] minor updates to workflow job names --- .github/workflows/tests_proc.yml | 4 ++-- .github/workflows/tests_unit.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/tests_proc.yml b/.github/workflows/tests_proc.yml index dc8c63d9..e8fbde3f 100644 --- a/.github/workflows/tests_proc.yml +++ b/.github/workflows/tests_proc.yml @@ -1,4 +1,4 @@ -name: Tests +name: Proc tests on: push: @@ -10,7 +10,7 @@ env: CARGO_TERM_COLOR: always jobs: - tests: + tests_proc: runs-on: ubuntu-latest diff --git a/.github/workflows/tests_unit.yml b/.github/workflows/tests_unit.yml index 91d6ca77..0a1fd8d6 100644 --- a/.github/workflows/tests_unit.yml +++ b/.github/workflows/tests_unit.yml @@ -1,4 +1,4 @@ -name: Tests +name: Unit tests on: push: @@ -10,7 +10,7 @@ env: CARGO_TERM_COLOR: always jobs: - tests: + tests_unit: runs-on: ubuntu-latest From de724566524e784ca3f1850aa18ba58616c1ee04 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sat, 23 Sep 2023 22:58:42 -0500 Subject: [PATCH 53/89] staging progress on snapshotting --- scripts/local_cluster.py | 14 +- src/manager/clusman.rs | 89 ++++++++- src/manager/reactor.rs | 12 ++ src/manager/reigner.rs | 6 + src/protocols/crossword.rs | 14 +- src/protocols/multipaxos.rs | 257 +++++++++++++++++++++++++- src/protocols/rs_paxos.rs | 14 +- src/protocols/simple_push.rs | 2 +- src/server/transport.rs | 5 + summerset_client/src/clients/bench.rs | 1 - 10 files changed, 383 insertions(+), 31 deletions(-) diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index c4e0877c..5b0b0658 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -48,13 +48,21 @@ def kill_all_matching(name, force=False): "Crossword": lambda r: f"backer_path='/tmp/summerset.crossword.{r}.wal'", } +PROTOCOL_SNAPSHOT_PATH = { + "MultiPaxos": lambda r: f"snapshot_path='/tmp/summerset.multipaxos.{r}.snap'", +} + -def config_with_backer_path(protocol, config, replica): +def config_with_file_paths(protocol, config, replica): result_config = PROTOCOL_BACKER_PATH[protocol](replica) + if protocol in PROTOCOL_SNAPSHOT_PATH: + result_config += "+" + result_config += PROTOCOL_SNAPSHOT_PATH[protocol](replica) if config is not None and len(config) > 0: - if "backer_path" in config: + if "backer_path" in config or "snapshot_path" in config: result_config = config # use user-supplied path + # NOTE: ignores the other one else: result_config += "+" result_config += config @@ -132,7 +140,7 @@ def launch_servers(protocol, num_replicas, release, config): SERVER_API_PORT(replica), SERVER_P2P_PORT(replica), f"127.0.0.1:{MANAGER_SRV_PORT}", - config_with_backer_path(protocol, config, 
replica), + config_with_file_paths(protocol, config, replica), release, ) proc = run_process(cmd) diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index 7b08e4d9..58b4a3b3 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -27,6 +27,9 @@ struct ServerInfo { /// This server is currently paused? is_paused: bool, + + /// In-mem log start index after latest snapshot. + start_slot: usize, } /// Standalone cluster manager oracle. @@ -204,6 +207,7 @@ impl ClusterManager { p2p_addr, is_leader: false, is_paused: false, + start_slot: 0, }, ); Ok(()) @@ -231,6 +235,28 @@ impl ClusterManager { } } + /// Handler of autonomous SnapshotUpTo message. + fn handle_snapshot_up_to( + &mut self, + server: ReplicaId, + new_start: usize, + ) -> Result<(), SummersetError> { + if !self.server_info.contains_key(&server) { + return logged_err!("m"; "snapshot up to got unknown ID: {}", server); + } + + // update this server's info + let info = self.server_info.get_mut(&server).unwrap(); + if new_start < info.start_slot { + logged_err!("m"; "server {} snapshot up to {} < {}", + server, new_start, + self.server_info[&server].start_slot) + } else { + info.start_slot = new_start; + Ok(()) + } + } + /// Synthesized handler of server-initiated control messages. async fn handle_ctrl_msg( &mut self, @@ -258,6 +284,10 @@ impl ClusterManager { self.handle_leader_status(server, step_up)?; } + CtrlMsg::SnapshotUpTo { new_start } => { + self.handle_snapshot_up_to(server, new_start)?; + } + _ => {} // ignore all other types } @@ -364,7 +394,7 @@ impl ClusterManager { // pause specified server(s) let mut pause_done = HashSet::new(); while let Some(s) = servers.pop() { - // send puase server control message to server + // send pause server control message to server self.server_reigner.send_ctrl(CtrlMsg::Pause, s)?; // set the is_paused flag @@ -404,19 +434,19 @@ impl ClusterManager { // resume specified server(s) let mut resume_done = HashSet::new(); while let Some(s) = servers.pop() { - // send puase server control message to server + // send resume server control message to server self.server_reigner.send_ctrl(CtrlMsg::Resume, s)?; - // clear the is_paused flag - assert!(self.server_info.contains_key(&s)); - self.server_info.get_mut(&s).unwrap().is_paused = false; - // wait for dummy reply let (_, reply) = self.server_reigner.recv_ctrl().await?; if reply != CtrlMsg::ResumeReply { return logged_err!("m"; "unexpected reply type received"); } + // clear the is_paused flag + assert!(self.server_info.contains_key(&s)); + self.server_info.get_mut(&s).unwrap().is_paused = false; + resume_done.insert(s); } @@ -428,6 +458,49 @@ impl ClusterManager { ) } + /// Handler of client TakeSnapshot rquest. 
+ async fn handle_client_take_snapshot( + &mut self, + client: ClientId, + servers: HashSet, + ) -> Result<(), SummersetError> { + let mut servers: Vec = if servers.is_empty() { + // all active servers + self.server_info.keys().copied().collect() + } else { + servers.into_iter().collect() + }; + + // tell specified server(s) + let mut snapshot_up_to = HashMap::new(); + while let Some(s) = servers.pop() { + // send take snapshot control message to server + self.server_reigner.send_ctrl(CtrlMsg::TakeSnapshot, s)?; + + // wait for reply + let (_, reply) = self.server_reigner.recv_ctrl().await?; + if let CtrlMsg::SnapshotUpTo { new_start } = reply { + // update the log start index info + assert!(self.server_info.contains_key(&s)); + if new_start < self.server_info[&s].start_slot { + return logged_err!("m"; "server {} snapshot up to {} < {}", + s, new_start, + self.server_info[&s].start_slot); + } else { + self.server_info.get_mut(&s).unwrap().start_slot = + new_start; + } + + snapshot_up_to.insert(s, new_start); + } else { + return logged_err!("m"; "unexpected reply type received"); + } + } + + self.client_reactor + .send_reply(CtrlReply::TakeSnapshot { snapshot_up_to }, client) + } + /// Synthesized handler of client-initiated control requests. async fn handle_ctrl_req( &mut self, @@ -453,6 +526,10 @@ impl ClusterManager { self.handle_client_resume_servers(client, servers).await?; } + CtrlRequest::TakeSnapshot { servers } => { + self.handle_client_take_snapshot(client, servers).await?; + } + _ => {} // ignore all other types } diff --git a/src/manager/reactor.rs b/src/manager/reactor.rs index 3273c54f..64ede623 100644 --- a/src/manager/reactor.rs +++ b/src/manager/reactor.rs @@ -45,6 +45,12 @@ pub enum CtrlRequest { servers: HashSet, }, + /// Tell the servers to take a snapshot now. + TakeSnapshot { + /// IDs of servers to take snapshot. If empty, tells all servers. + servers: HashSet, + }, + /// Client leave notification. Leave, } @@ -69,6 +75,12 @@ pub enum CtrlReply { /// Reply to server resume request. ResumeServers { servers: HashSet }, + /// Reply to take snapshot request. + TakeSnapshot { + /// Map from replica ID -> new log start index. + snapshot_up_to: HashMap, + }, + /// Reply to client leave notification. Leave, } diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index c3551e30..41ae38ec 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -58,6 +58,12 @@ pub enum CtrlMsg { /// Server -> Manager: dummy resume reply. ResumeReply, + /// Manager -> Server: tell server to take a snapshot now. + TakeSnapshot, + + /// Server -> Manager: server took snapshot up to log index. + SnapshotUpTo { new_start: usize }, + /// Server -> Manager: leave notification. 
Leave, diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index c9c6a8da..424b0cc8 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -1610,7 +1610,7 @@ impl CrosswordReplica { { Ok(()) } else { - logged_err!(self.id; "unexpected log result type") + logged_err!(self.id; "unexpected log result type or failed truncate") } } } @@ -1824,7 +1824,7 @@ impl GenericReplica for CrosswordReplica { if let Err(e) = self.handle_msg_recv(peer, msg) { pf_error!(self.id; "error handling msg recv <- {}: {}", peer, e); } - } + }, // state machine execution result cmd_result = self.state_machine.get_result(), if !paused => { @@ -1840,13 +1840,17 @@ impl GenericReplica for CrosswordReplica { // leader inactivity timeout _ = self.hb_hear_timer.timeout(), if !paused => { - self.become_a_leader()?; + if let Err(e) = self.become_a_leader() { + pf_error!(self.id; "error becoming a leader: {}", e); + } }, // leader sending heartbeat _ = self.hb_send_interval.tick(), if !paused && self.is_leader => { - self.bcast_heartbeats()?; - } + if let Err(e) = self.bcast_heartbeats() { + pf_error!(self.id; "error broadcasting heartbeats: {}", e); + } + }, // manager control message ctrl_msg = self.control_hub.recv_ctrl() => { diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index e5024071..58fbaa84 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -14,9 +14,9 @@ use std::net::SocketAddr; use crate::utils::{SummersetError, Bitmap, Timer}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ - ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, - ApiRequest, ApiReply, StorageHub, LogAction, LogResult, LogActionId, - TransportHub, GenericReplica, + ReplicaId, ControlHub, StateMachine, Command, CommandResult, CommandId, + ExternalApi, ApiRequest, ApiReply, StorageHub, LogAction, LogResult, + LogActionId, TransportHub, GenericReplica, }; use crate::client::{ClientId, ClientApiStub, ClientCtrlStub, GenericEndpoint}; use crate::protocols::SmrProtocol; @@ -41,7 +41,7 @@ pub struct ReplicaConfigMultiPaxos { /// Client request batching maximum batch size. pub max_batch_size: usize, - /// Path to backing file. + /// Path to backing log file. pub backer_path: String, /// Whether to call `fsync()`/`fdatasync()` on logger. @@ -56,6 +56,13 @@ pub struct ReplicaConfigMultiPaxos { /// Interval of leader sending heartbeats to followers. pub hb_send_interval_ms: u64, + /// Path to snapshot file. + pub snapshot_path: String, + + /// Snapshot self-triggering interval in secs. 0 means never trigger + /// snapshotting autonomously. + pub snapshot_interval_s: u64, + // Performance simulation params (all zeros means no perf simulation): pub perf_storage_a: u64, pub perf_storage_b: u64, @@ -74,6 +81,8 @@ impl Default for ReplicaConfigMultiPaxos { hb_hear_timeout_min: 300, hb_hear_timeout_max: 600, hb_send_interval_ms: 50, + snapshot_path: "/tmp/summerset.multipaxos.snap".into(), + snapshot_interval_s: 0, perf_storage_a: 0, perf_storage_b: 0, perf_network_a: 0, @@ -162,6 +171,17 @@ enum LogEntry { CommitSlot { slot: usize }, } +/// Snapshot file entry type. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] +enum SnapEntry { + /// First entry at the start of file: number of log instances covered by + /// this snapshot file == the start slot index of in-mem log. + StartSlot { slot: usize }, + + /// Key-value pair entry to apply to the state. 
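The snapshot file is thus a start-slot header followed by one entry per key-value pair written by executed Put commands; recovery replays those pairs into the state machine and then treats the recorded slot as the new in-memory log start. A minimal sketch of that replay, using simplified standalone types rather than the crate's StorageHub and state machine modules:

// Minimal sketch (simplified standalone types): replaying a snapshot file
// rebuilds the key-value state and yields the new log start index.
use std::collections::HashMap;

enum SnapEntry {
    StartSlot { slot: usize },
    NewKVPair { key: String, value: String },
}

fn replay_snapshot(entries: &[SnapEntry]) -> (usize, HashMap<String, String>) {
    let mut start_slot = 0;
    let mut state = HashMap::new();
    for entry in entries {
        match entry {
            // number of log instances already covered by this snapshot
            SnapEntry::StartSlot { slot } => start_slot = *slot,
            // apply each dumped Put to the in-memory state machine image
            SnapEntry::NewKVPair { key, value } => {
                state.insert(key.clone(), value.clone());
            }
        }
    }
    (start_slot, state)
}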
+ NewKVPair { key: String, value: String }, +} + /// Peer-peer message type. #[derive(Debug, Clone, Serialize, Deserialize, GetSize)] enum PeerMsg { @@ -226,6 +246,9 @@ pub struct MultiPaxosReplica { /// StorageHub module. storage_hub: StorageHub, + /// StorageHub module for the snapshot file. + snapshot_hub: StorageHub, + /// TransportHub module. transport_hub: TransportHub, @@ -241,6 +264,12 @@ pub struct MultiPaxosReplica { /// In-memory log of instances. insts: Vec, + /// Start slot index of in-mem log after latest snapshot. + start_slot: usize, + + /// Timer for taking a new autonomous snapshot. + snapshot_interval: Option, + /// Largest ballot number that a leader has sent Prepare messages in. bal_prep_sent: Ballot, @@ -259,6 +288,9 @@ pub struct MultiPaxosReplica { /// Current durable log file offset. log_offset: usize, + + /// Current durable snapshot file offset. + snap_offset: usize, } impl MultiPaxosReplica { @@ -1124,6 +1156,19 @@ impl MultiPaxosReplica { Ok(()) } + /// Handler of TakeSnapshot control message. + async fn handle_ctrl_take_snapshot( + &mut self, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server told to take snapshot"); + self.take_new_snapshot().await?; + + self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { + new_start: self.start_slot, + })?; + Ok(()) + } + /// Synthesized handler of manager control messages. If ok, returns /// `Some(true)` if decides to terminate and reboot, `Some(false)` if /// decides to shutdown completely, and `None` if not terminating. @@ -1148,6 +1193,11 @@ impl MultiPaxosReplica { Ok(None) } + CtrlMsg::TakeSnapshot => { + self.handle_ctrl_take_snapshot().await?; + Ok(None) + } + _ => Ok(None), // ignore all other types } } @@ -1276,7 +1326,154 @@ impl MultiPaxosReplica { { Ok(()) } else { - logged_err!(self.id; "unexpected log result type") + logged_err!(self.id; "unexpected log result type or failed truncate") + } + } + + /// Take a snapshot up to current exec_idx, then discard the in-mem log up + /// to that index, and squash the durable WAL log file. + /// + /// NOTE: the current implementation does not guard against crashes in the + /// middle of taking a snapshot. + async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { + pf_debug!(self.id; "taking a new snapshot: start {} exec {}", + self.start_slot, self.exec_bar); + assert!(self.exec_bar >= self.start_slot); + if self.exec_bar == self.start_slot { + return Ok(()); + } + + // dump all Puts in executed instances + for slot in self.start_slot..self.exec_bar { + let inst = &self.insts[slot - self.start_slot]; + for (_, req) in &inst.reqs { + if let ApiRequest::Req { + cmd: Command::Put { key, value }, + .. 
+ } = req + { + self.snapshot_hub.submit_action( + 0, // using 0 as dummy log action ID + LogAction::Append { + entry: SnapEntry::NewKVPair { + key: key.clone(), + value: value.clone(), + }, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = + self.snapshot_hub.get_result().await?; + if let LogResult::Write { + offset_ok: true, + now_size, + } = log_result + { + self.snap_offset = now_size; + } else { + return logged_err!( + self.id; + "unexpected log result type or failed write" + ); + } + } + } + } + + // update start_slot and discard all in-memory log instances up to exec_bar + self.insts.drain(0..(self.exec_bar - self.start_slot)); + self.start_slot = self.exec_bar; + + // TODO: squash the durable WAL log + + pf_debug!(self.id; "took new snapshot up to: start {}", self.start_slot); + Ok(()) + } + + /// Recover initial state from durable storage snapshot file. + async fn recover_from_snapshot(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.snap_offset, 0); + + // first, try to read the first several bytes, which should record the + // start_slot index + self.snapshot_hub + .submit_action(0, LogAction::Read { offset: 0 })?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(SnapEntry::StartSlot { slot }), + end_offset, + } => { + self.snap_offset = end_offset; + self.start_slot = slot; // get start slot index of in-mem log + + // repeatedly apply key-value pairs + loop { + self.snapshot_hub.submit_action( + 0, + LogAction::Read { + offset: self.snap_offset, + }, + )?; + let (_, log_result) = + self.snapshot_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(SnapEntry::NewKVPair { key, value }), + end_offset, + } => { + // execute a Put command on state machine + self.state_machine + .submit_cmd(0, Command::Put { key, value })?; + let _ = self.state_machine.get_result().await?; + // update snapshot file offset + self.snap_offset = end_offset; + } + LogResult::Read { entry: None, .. } => { + // end of log reached + break; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + } + + // tell manager about my start_slot index + self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { + new_start: self.start_slot, + })?; + Ok(()) + } + + LogResult::Read { entry: None, .. } => { + // snapshot file is empty. 
Write a 0 as start_slot and return + self.snapshot_hub.submit_action( + 0, + LogAction::Write { + entry: SnapEntry::StartSlot { slot: 0 }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + if let LogResult::Write { + offset_ok: true, + now_size, + } = log_result + { + self.snap_offset = now_size; + Ok(()) + } else { + logged_err!(self.id; "unexpected log result type or failed truncate") + } + } + + _ => { + logged_err!(self.id; "unexpected log result type") + } } } } @@ -1383,6 +1580,14 @@ impl GenericReplica for MultiPaxosReplica { } transport_hub.wait_for_group(population).await?; + // setup snapshot hub module + let snapshot_hub = StorageHub::new_and_setup( + id, + Path::new(&config.snapshot_path), + None, + ) + .await?; + // setup external API module, ready to take in client requests let external_api = ExternalApi::new_and_setup( id, @@ -1396,6 +1601,15 @@ impl GenericReplica for MultiPaxosReplica { time::interval(Duration::from_millis(config.hb_send_interval_ms)); hb_send_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + let snapshot_interval = if config.snapshot_interval_s == 0 { + None + } else { + let mut si = + time::interval(Duration::from_secs(config.snapshot_interval_s)); + si.set_missed_tick_behavior(MissedTickBehavior::Skip); + Some(si) + }; + Ok(MultiPaxosReplica { id, population, @@ -1407,17 +1621,21 @@ impl GenericReplica for MultiPaxosReplica { external_api, state_machine, storage_hub, + snapshot_hub, transport_hub, hb_hear_timer: Timer::new(), hb_send_interval, is_leader: false, insts: vec![], + start_slot: 0, + snapshot_interval, bal_prep_sent: 0, bal_prepared: 0, bal_max_seen: 0, commit_bar: 0, exec_bar: 0, log_offset: 0, + snap_offset: 0, }) } @@ -1425,7 +1643,10 @@ impl GenericReplica for MultiPaxosReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { - // recover state from durable storage log + // recover state from durable snapshot file + self.recover_from_snapshot().await?; + + // recover the tail-piece memory log & state from durable storage log self.recover_from_log().await?; // kick off leader activity hearing timer @@ -1470,7 +1691,7 @@ impl GenericReplica for MultiPaxosReplica { if let Err(e) = self.handle_msg_recv(peer, msg) { pf_error!(self.id; "error handling msg recv <- {}: {}", peer, e); } - } + }, // state machine execution result cmd_result = self.state_machine.get_result(), if !paused => { @@ -1486,13 +1707,29 @@ impl GenericReplica for MultiPaxosReplica { // leader inactivity timeout _ = self.hb_hear_timer.timeout(), if !paused => { - self.become_a_leader()?; + if let Err(e) = self.become_a_leader() { + pf_error!(self.id; "error becoming a leader: {}", e); + } }, // leader sending heartbeat _ = self.hb_send_interval.tick(), if !paused && self.is_leader => { - self.bcast_heartbeats()?; - } + if let Err(e) = self.bcast_heartbeats() { + pf_error!(self.id; "error broadcasting heartbeats: {}", e); + } + }, + + // autonomous snapshot taking timeout + _ = self.snapshot_interval.as_mut().unwrap().tick(), if !paused + && self.snapshot_interval.is_some() => { + if let Err(e) = self.take_new_snapshot().await { + pf_error!(self.id; "error taking a new snapshot: {}", e); + } else { + self.control_hub.send_ctrl( + CtrlMsg::SnapshotUpTo { new_start: self.start_slot } + )?; + } + }, // manager control message ctrl_msg = self.control_hub.recv_ctrl() => { diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 9a79da2e..6df73631 100644 --- 
a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -1510,7 +1510,7 @@ impl RSPaxosReplica { { Ok(()) } else { - logged_err!(self.id; "unexpected log result type") + logged_err!(self.id; "unexpected log result type or failed truncate") } } } @@ -1717,7 +1717,7 @@ impl GenericReplica for RSPaxosReplica { if let Err(e) = self.handle_msg_recv(peer, msg) { pf_error!(self.id; "error handling msg recv <- {}: {}", peer, e); } - } + }, // state machine execution result cmd_result = self.state_machine.get_result(), if !paused => { @@ -1733,13 +1733,17 @@ impl GenericReplica for RSPaxosReplica { // leader inactivity timeout _ = self.hb_hear_timer.timeout(), if !paused => { - self.become_a_leader()?; + if let Err(e) = self.become_a_leader() { + pf_error!(self.id; "error becoming a leader: {}", e); + } }, // leader sending heartbeat _ = self.hb_send_interval.tick(), if !paused && self.is_leader => { - self.bcast_heartbeats()?; - } + if let Err(e) = self.bcast_heartbeats() { + pf_error!(self.id; "error broadcasting heartbeats: {}", e); + } + }, // manager control message ctrl_msg = self.control_hub.recv_ctrl() => { diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index 5d8baeec..ce89c7d1 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -721,7 +721,7 @@ impl GenericReplica for SimplePushReplica { }, } - } + }, // state machine execution result cmd_result = self.state_machine.get_result(), if !paused => { diff --git a/src/server/transport.rs b/src/server/transport.rs index 18caa475..6f2a2752 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -1,4 +1,9 @@ //! Summerset server internal TCP transport module implementation. +//! +//! In concept, all messages are sent through unstable communication channels, +//! and are retried if the sender did not receive an ACK in a timely manner. +//! Here, we use TCP as the communication protocol to get the same effect of +//! "every message a sender wants to send will eventually be delivered". use std::fmt; use std::net::SocketAddr; diff --git a/summerset_client/src/clients/bench.rs b/summerset_client/src/clients/bench.rs index 6b6ea18a..a2a7066e 100644 --- a/summerset_client/src/clients/bench.rs +++ b/summerset_client/src/clients/bench.rs @@ -19,7 +19,6 @@ use summerset::{ lazy_static! { /// Pool of keys to choose from. - // TODO: enable using a dynamic pool of keys static ref KEYS_POOL: Vec = { let mut pool = vec![]; for _ in 0..5 { From d7d091ab4278c28aac41bb495d4d8b77cb80324a Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 24 Sep 2023 13:00:15 -0500 Subject: [PATCH 54/89] staging progress on snapshotting --- src/protocols/multipaxos.rs | 248 ++++++++++++++++++++++++++---------- src/server/statemach.rs | 11 ++ src/server/storage.rs | 11 ++ src/server/transport.rs | 12 ++ 4 files changed, 217 insertions(+), 65 deletions(-) diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 58fbaa84..22f02fc1 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -283,7 +283,7 @@ pub struct MultiPaxosReplica { commit_bar: usize, /// Index of the first non-executed instance. - /// It is always true that exec_bar <= commit_bar <= insts.len() + /// It is always true that exec_bar <= commit_bar <= start_slot + insts.len() exec_bar: usize, /// Current durable log file offset. 
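// Editor's illustrative sketch (not part of the patch): the hunks below apply one
// mechanical convention throughout MultiPaxosReplica -- slot numbers stay global and
// monotonically increasing, while `start_slot` records how many slots the latest
// snapshot has truncated from the in-memory `insts` vector, so the physical index is
// always `slot - start_slot` and anything with `slot < start_slot` is simply ignored.
// The names here are simplified stand-ins, not the real struct.

struct Log<T> {
    start_slot: usize, // first logical slot still kept in memory
    insts: Vec<T>,     // in-memory instances for slots start_slot..
}

impl<T> Log<T> {
    /// Translate a global slot number into an index into `insts`,
    /// returning None for slots already covered by a snapshot.
    fn get(&self, slot: usize) -> Option<&T> {
        if slot < self.start_slot {
            None // outdated: this slot was truncated by a snapshot
        } else {
            self.insts.get(slot - self.start_slot)
        }
    }

    /// Logical index one past the last known slot, i.e. the value the
    /// patch writes as `start_slot + insts.len()`.
    fn next_slot(&self) -> usize {
        self.start_slot + self.insts.len()
    }
}

fn main() {
    let log = Log { start_slot: 5, insts: vec!["e5", "e6", "e7"] };
    assert!(log.get(3).is_none());       // snapshotted slot is ignored
    assert_eq!(log.get(6), Some(&"e6")); // slot 6 lives at in-memory index 1
    assert_eq!(log.next_slot(), 8);      // matches start_slot + insts.len()
}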
@@ -388,9 +388,9 @@ impl MultiPaxosReplica { // create a new instance in the first null slot (or append a new one // at the end if no holes exist) - let mut slot = self.insts.len(); - for s in self.commit_bar..self.insts.len() { - let old_inst = &mut self.insts[s]; + let mut slot = self.start_slot + self.insts.len(); + for s in self.commit_bar..(self.start_slot + self.insts.len()) { + let old_inst = &mut self.insts[s - self.start_slot]; if old_inst.status == Status::Null { old_inst.reqs = req_batch.clone(); old_inst.leader_bk = Some(LeaderBookkeeping { @@ -402,7 +402,7 @@ impl MultiPaxosReplica { break; } } - if slot == self.insts.len() { + if slot == self.start_slot + self.insts.len() { let mut new_inst = self.null_instance(); new_inst.reqs = req_batch.clone(); new_inst.leader_bk = Some(LeaderBookkeeping { @@ -425,7 +425,7 @@ impl MultiPaxosReplica { self.bal_max_seen = self.bal_prep_sent; } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = self.bal_prep_sent; inst.status = Status::Preparing; pf_debug!(self.id; "enter Prepare phase for slot {} bal {}", @@ -457,7 +457,7 @@ impl MultiPaxosReplica { slot, inst.bal); } else { // normal case: Prepare phase covered, only do the Accept phase - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = self.bal_prepared; inst.status = Status::Accepting; pf_debug!(self.id; "enter Accept phase for slot {} bal {}", @@ -500,9 +500,12 @@ impl MultiPaxosReplica { &mut self, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "finished PrepareBal logging for slot {} bal {}", - slot, self.insts[slot].bal); - let inst = &self.insts[slot]; + slot, self.insts[slot - self.start_slot].bal); + let inst = &self.insts[slot - self.start_slot]; let voted = if inst.voted.0 > 0 { Some(inst.voted.clone()) } else { @@ -539,9 +542,12 @@ impl MultiPaxosReplica { &mut self, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "finished AcceptData logging for slot {} bal {}", - slot, self.insts[slot].bal); - let inst = &self.insts[slot]; + slot, self.insts[slot - self.start_slot].bal); + let inst = &self.insts[slot - self.start_slot]; if self.is_leader { // on leader, finishing the logging of an AcceptData entry @@ -572,14 +578,17 @@ impl MultiPaxosReplica { &mut self, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "finished CommitSlot logging for slot {} bal {}", - slot, self.insts[slot].bal); - assert!(self.insts[slot].status >= Status::Committed); + slot, self.insts[slot - self.start_slot].bal); + assert!(self.insts[slot - self.start_slot].status >= Status::Committed); // update index of the first non-committed instance if slot == self.commit_bar { - while self.commit_bar < self.insts.len() { - let inst = &mut self.insts[self.commit_bar]; + while self.commit_bar < self.start_slot + self.insts.len() { + let inst = &mut self.insts[self.commit_bar - self.start_slot]; if inst.status < Status::Committed { break; } @@ -617,7 +626,10 @@ impl MultiPaxosReplica { log_result: LogResult, ) -> Result<(), SummersetError> { let (slot, entry_type) = Self::split_log_action_id(action_id); - assert!(slot < self.insts.len()); + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + 
} + assert!(slot < self.start_slot + self.insts.len()); if let LogResult::Append { now_size } = log_result { assert!(now_size >= self.log_offset); @@ -643,16 +655,19 @@ impl MultiPaxosReplica { slot: usize, ballot: Ballot, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received Prepare <- {} for slot {} bal {}", peer, slot, ballot); // if ballot is not smaller than what I have seen: if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()); } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; assert!(inst.bal <= ballot); inst.bal = ballot; @@ -685,13 +700,16 @@ impl MultiPaxosReplica { ballot: Ballot, voted: Option<(Ballot, ReqBatch)>, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received PrepareReply <- {} for slot {} bal {}", peer, slot, ballot); // if ballot is what I'm currently waiting on for Prepare replies: if ballot == self.bal_prep_sent { - assert!(slot < self.insts.len()); - let inst = &mut self.insts[slot]; + assert!(slot < self.start_slot + self.insts.len()); + let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies if (inst.status != Status::Preparing) || (ballot < inst.bal) { @@ -766,16 +784,19 @@ impl MultiPaxosReplica { ballot: Ballot, reqs: ReqBatch, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received Accept <- {} for slot {} bal {}", peer, slot, ballot); // if ballot is not smaller than what I have made promises for: if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()); } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; assert!(inst.bal <= ballot); inst.bal = ballot; @@ -809,13 +830,16 @@ impl MultiPaxosReplica { slot: usize, ballot: Ballot, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received AcceptReply <- {} for slot {} bal {}", peer, slot, ballot); // if ballot is what I'm currently waiting on for Accept replies: if ballot == self.bal_prepared { - assert!(slot < self.insts.len()); - let inst = &mut self.insts[slot]; + assert!(slot < self.start_slot + self.insts.len()); + let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies if (inst.status != Status::Accepting) || (ballot < inst.bal) { @@ -866,13 +890,16 @@ impl MultiPaxosReplica { peer: ReplicaId, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received Commit <- {} for slot {}", peer, slot); // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()); } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications if inst.status != Status::Accepting { 
@@ -931,11 +958,14 @@ impl MultiPaxosReplica { cmd_result: CommandResult, ) -> Result<(), SummersetError> { let (slot, cmd_idx) = Self::split_command_id(cmd_id); - assert!(slot < self.insts.len()); + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } + assert!(slot < self.start_slot + self.insts.len()); pf_trace!(self.id; "executed cmd in instance at slot {} idx {}", slot, cmd_idx); - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; assert!(cmd_idx < inst.reqs.len()); let (client, ref req) = inst.reqs[cmd_idx]; @@ -966,8 +996,8 @@ impl MultiPaxosReplica { // update index of the first non-executed instance if slot == self.exec_bar { - while self.exec_bar < self.insts.len() { - let inst = &mut self.insts[self.exec_bar]; + while self.exec_bar < self.start_slot + self.insts.len() { + let inst = &mut self.insts[self.exec_bar - self.start_slot]; if inst.status < Status::Executed { break; } @@ -1000,7 +1030,12 @@ impl MultiPaxosReplica { self.bal_max_seen = self.bal_prep_sent; // redo Prepare phase for all in-progress instances - for (slot, inst) in self.insts.iter_mut().enumerate() { + for (slot, inst) in self + .insts + .iter_mut() + .enumerate() + .map(|(s, i)| (self.start_slot + s, i)) + { if inst.status < Status::Committed { inst.bal = self.bal_prep_sent; inst.status = Status::Preparing; @@ -1210,11 +1245,11 @@ impl MultiPaxosReplica { match entry { LogEntry::PrepareBal { slot, ballot } => { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()); } // update instance state - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = ballot; inst.status = Status::Preparing; // update bal_prep_sent and bal_max_seen, reset bal_prepared @@ -1229,11 +1264,11 @@ impl MultiPaxosReplica { LogEntry::AcceptData { slot, ballot, reqs } => { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()); } // update instance state - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = ballot; inst.status = Status::Accepting; inst.reqs = reqs.clone(); @@ -1249,14 +1284,15 @@ impl MultiPaxosReplica { } LogEntry::CommitSlot { slot } => { - assert!(slot < self.insts.len()); + assert!(slot < self.start_slot + self.insts.len()); // update instance state - self.insts[slot].status = Status::Committed; + self.insts[slot - self.start_slot].status = Status::Committed; // submit commands in contiguously committed instance to the // state machine if slot == self.commit_bar { - while self.commit_bar < self.insts.len() { - let inst = &mut self.insts[self.commit_bar]; + while self.commit_bar < self.start_slot + self.insts.len() { + let inst = + &mut self.insts[self.commit_bar - self.start_slot]; if inst.status < Status::Committed { break; } @@ -1330,6 +1366,110 @@ impl MultiPaxosReplica { } } + /// Dump a new key-value pair to snapshot file. 
+ async fn snapshot_dump_kv_pair( + &mut self, + key: String, + value: String, + ) -> Result<(), SummersetError> { + self.snapshot_hub.submit_action( + 0, // using 0 as dummy log action ID + LogAction::Append { + entry: SnapEntry::NewKVPair { key, value }, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + if let LogResult::Write { + offset_ok: true, + now_size, + } = log_result + { + self.snap_offset = now_size; + Ok(()) + } else { + logged_err!( + self.id; + "unexpected log result type or failed write" + ) + } + } + + /// Squash the durable WAL log, discarding everything older than start_slot. + async fn snapshot_squash_log(&mut self) -> Result<(), SummersetError> { + // read entries until one >= start_slot found + let mut cut_offset = 0; + loop { + self.storage_hub.submit_action( + 0, // using 0 as dummy log action ID + LogAction::Read { offset: cut_offset }, + )?; + + let mut found = false; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id != 0 { + // normal log action previously in queue; process it + self.handle_log_result(action_id, log_result)?; + } else { + match log_result { + LogResult::Read { + entry: Some(entry), + end_offset, + } => { + let slot = match entry { + LogEntry::PrepareBal { slot, .. } => slot, + LogEntry::AcceptData { slot, .. } => slot, + LogEntry::CommitSlot { slot } => slot, + }; + if slot >= self.start_slot { + // first entry >= start_slot found + found = true; + } else { + // not found yet + cut_offset = end_offset; + } + } + LogResult::Read { entry: None, .. } => { + // end of WAL log + found = true; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + break; + } + } + + if found { + break; + } + } + + // discard the log before cut_offset + if cut_offset > 0 { + self.storage_hub + .submit_action(0, LogAction::Discard { offset: cut_offset })?; + let (_, log_result) = self.storage_hub.get_result().await?; + if let LogResult::Discard { + offset_ok: true, + now_size, + } = log_result + { + assert_eq!(self.log_offset - cut_offset, now_size); + self.log_offset = now_size; + } else { + return logged_err!( + self.id; + "unexpected log result type or failed discard" + ); + } + } + Ok(()) + } + /// Take a snapshot up to current exec_idx, then discard the in-mem log up /// to that index, and squash the durable WAL log file. /// @@ -1346,36 +1486,13 @@ impl MultiPaxosReplica { // dump all Puts in executed instances for slot in self.start_slot..self.exec_bar { let inst = &self.insts[slot - self.start_slot]; - for (_, req) in &inst.reqs { + for (_, req) in inst.reqs.clone() { if let ApiRequest::Req { cmd: Command::Put { key, value }, .. 
} = req { - self.snapshot_hub.submit_action( - 0, // using 0 as dummy log action ID - LogAction::Append { - entry: SnapEntry::NewKVPair { - key: key.clone(), - value: value.clone(), - }, - sync: self.config.logger_sync, - }, - )?; - let (_, log_result) = - self.snapshot_hub.get_result().await?; - if let LogResult::Write { - offset_ok: true, - now_size, - } = log_result - { - self.snap_offset = now_size; - } else { - return logged_err!( - self.id; - "unexpected log result type or failed write" - ); - } + self.snapshot_dump_kv_pair(key, value).await?; } } } @@ -1384,7 +1501,8 @@ impl MultiPaxosReplica { self.insts.drain(0..(self.exec_bar - self.start_slot)); self.start_slot = self.exec_bar; - // TODO: squash the durable WAL log + // squash the durable WAL log, discarding everything older than start_slot + self.snapshot_squash_log().await?; pf_debug!(self.id; "took new snapshot up to: start {}", self.start_slot); Ok(()) diff --git a/src/server/statemach.rs b/src/server/statemach.rs index 47196cf4..452b682e 100644 --- a/src/server/statemach.rs +++ b/src/server/statemach.rs @@ -94,6 +94,17 @@ impl StateMachine { None => logged_err!(self.me; "ack channel has been closed"), } } + + /// Try to get the next execution result using `try_recv()`. + #[allow(dead_code)] + pub fn try_get_result( + &mut self, + ) -> Result<(CommandId, CommandResult), SummersetError> { + match self.rx_ack.try_recv() { + Ok((id, result)) => Ok((id, result)), + Err(e) => Err(SummersetError(e.to_string())), + } + } } // StateMachine executor thread implementation diff --git a/src/server/storage.rs b/src/server/storage.rs index d14bd2bf..a11d6ba6 100644 --- a/src/server/storage.rs +++ b/src/server/storage.rs @@ -182,6 +182,17 @@ where None => logged_err!(self.me; "ack channel has been closed"), } } + + /// Try to get the next logging result using `try_recv()`. + #[allow(dead_code)] + pub fn try_get_result( + &mut self, + ) -> Result<(LogActionId, LogResult), SummersetError> { + match self.rx_ack.try_recv() { + Ok((id, result)) => Ok((id, result)), + Err(e) => Err(SummersetError(e.to_string())), + } + } } // StorageHub logger thread implementation diff --git a/src/server/transport.rs b/src/server/transport.rs index 6f2a2752..699697d8 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -281,6 +281,18 @@ where } } + /// Try to receive the next message using `try_recv()`. + #[allow(dead_code)] + pub fn try_recv_msg(&mut self) -> Result<(ReplicaId, Msg), SummersetError> { + match self.rx_recv.try_recv() { + Ok((id, peer_msg)) => match peer_msg { + PeerMessage::Msg { msg } => Ok((id, msg)), + _ => logged_err!(self.me; "unexpected peer message type"), + }, + Err(e) => Err(SummersetError(e.to_string())), + } + } + /// Broadcasts leave notifications to all peers and waits for replies. 
pub async fn leave(&mut self) -> Result<(), SummersetError> { let tx_sends_guard = self.tx_sends.guard(); From cac7a52cf98c1b6f31cb143e7fc21483ab5467f7 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 24 Sep 2023 16:41:01 -0500 Subject: [PATCH 55/89] finished snapshotting impl for MultiPaxos --- scripts/local_client.py | 7 +- scripts/local_cluster.py | 11 +- src/manager/clusman.rs | 4 + src/protocols/multipaxos.rs | 280 +++++++++++++------------ summerset_client/src/clients/tester.rs | 70 ++++++- 5 files changed, 225 insertions(+), 147 deletions(-) diff --git a/scripts/local_client.py b/scripts/local_client.py index f8ac5981..87e1d9eb 100644 --- a/scripts/local_client.py +++ b/scripts/local_client.py @@ -9,7 +9,7 @@ def do_cargo_build(release): if release: cmd.append("-r") proc = subprocess.Popen(cmd) - proc.wait() + return proc.wait() def run_process(cmd): @@ -124,7 +124,10 @@ def run_client(protocol, utility, params, release, config): args = parser.parse_args() # build everything - do_cargo_build(args.release) + rc = do_cargo_build(args.release) + if rc != 0: + print("ERROR: cargo build failed") + sys.exit(rc) # run client executable client_proc = run_client( diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index 5b0b0658..6d23db83 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -12,7 +12,7 @@ def do_cargo_build(release): if release: cmd.append("-r") proc = subprocess.Popen(cmd) - proc.wait() + return proc.wait() def run_process(cmd, capture_stderr=False): @@ -169,12 +169,17 @@ def launch_servers(protocol, num_replicas, release, config): kill_all_matching("summerset_server", force=True) kill_all_matching("summerset_manager", force=True) - # remove all existing wal files + # remove all existing wal log & snapshot files for path in Path("/tmp").glob("summerset.*.wal"): path.unlink() + for path in Path("/tmp").glob("summerset.*.snap"): + path.unlink() # build everything - do_cargo_build(args.release) + rc = do_cargo_build(args.release) + if rc != 0: + print("ERROR: cargo build failed") + sys.exit(rc) # launch cluster manager oracle first manager_proc = launch_manager(args.protocol, args.num_replicas, args.release) diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index 58b4a3b3..f1a496c9 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -12,6 +12,7 @@ use crate::client::ClientId; use crate::protocols::SmrProtocol; use tokio::sync::{mpsc, watch}; +use tokio::time::{self, Duration}; /// Information about an active server. #[derive(Debug, Clone)] @@ -358,6 +359,9 @@ impl ClusterManager { return logged_err!("m"; "error assigning new server ID: {}", e); } + // wait a while to ensure the server's transport hub is setup + time::sleep(Duration::from_millis(300)).await; + reset_done.insert(s); } diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 22f02fc1..a6123a8a 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -78,8 +78,8 @@ impl Default for ReplicaConfigMultiPaxos { max_batch_size: 5000, backer_path: "/tmp/summerset.multipaxos.wal".into(), logger_sync: false, - hb_hear_timeout_min: 300, - hb_hear_timeout_max: 600, + hb_hear_timeout_min: 600, + hb_hear_timeout_max: 900, hb_send_interval_ms: 50, snapshot_path: "/tmp/summerset.multipaxos.snap".into(), snapshot_interval_s: 0, @@ -152,6 +152,9 @@ struct Instance { /// True if from external client, else false. external: bool, + + /// Offset of first durable WAL log entry related to this instance. 
+ log_offset: usize, } /// Stable storage log entry type. @@ -178,8 +181,8 @@ enum SnapEntry { /// this snapshot file == the start slot index of in-mem log. StartSlot { slot: usize }, - /// Key-value pair entry to apply to the state. - NewKVPair { key: String, value: String }, + /// Set of key-value pairs to apply to the state. + KVPairSet { pairs: HashMap }, } /// Peer-peer message type. @@ -268,7 +271,7 @@ pub struct MultiPaxosReplica { start_slot: usize, /// Timer for taking a new autonomous snapshot. - snapshot_interval: Option, + snapshot_interval: Interval, /// Largest ballot number that a leader has sent Prepare messages in. bal_prep_sent: Ballot, @@ -304,6 +307,7 @@ impl MultiPaxosReplica { leader_bk: None, replica_bk: None, external: false, + log_offset: 0, } } @@ -520,18 +524,18 @@ impl MultiPaxosReplica { } else { // on follower replica, finishing the logging of a // PrepareBal entry leads to sending back a Prepare reply - assert!(inst.replica_bk.is_some()); - let source = inst.replica_bk.as_ref().unwrap().source; - self.transport_hub.send_msg( - PeerMsg::PrepareReply { - slot, - ballot: inst.bal, - voted, - }, - source, - )?; - pf_trace!(self.id; "sent PrepareReply -> {} for slot {} bal {}", - source, slot, inst.bal); + if let Some(ReplicaBookkeeping { source }) = inst.replica_bk { + self.transport_hub.send_msg( + PeerMsg::PrepareReply { + slot, + ballot: inst.bal, + voted, + }, + source, + )?; + pf_trace!(self.id; "sent PrepareReply -> {} for slot {} bal {}", + source, slot, inst.bal); + } } Ok(()) @@ -557,17 +561,17 @@ impl MultiPaxosReplica { } else { // on follower replica, finishing the logging of an // AcceptData entry leads to sending back an Accept reply - assert!(inst.replica_bk.is_some()); - let source = inst.replica_bk.as_ref().unwrap().source; - self.transport_hub.send_msg( - PeerMsg::AcceptReply { - slot, - ballot: inst.bal, - }, - source, - )?; - pf_trace!(self.id; "sent AcceptReply -> {} for slot {} bal {}", - source, slot, inst.bal); + if let Some(ReplicaBookkeeping { source }) = inst.replica_bk { + self.transport_hub.send_msg( + PeerMsg::AcceptReply { + slot, + ballot: inst.bal, + }, + source, + )?; + pf_trace!(self.id; "sent AcceptReply -> {} for slot {} bal {}", + source, slot, inst.bal); + } } Ok(()) @@ -583,7 +587,6 @@ impl MultiPaxosReplica { } pf_trace!(self.id; "finished CommitSlot logging for slot {} bal {}", slot, self.insts[slot - self.start_slot].bal); - assert!(self.insts[slot - self.start_slot].status >= Status::Committed); // update index of the first non-committed instance if slot == self.commit_bar { @@ -633,6 +636,13 @@ impl MultiPaxosReplica { if let LogResult::Append { now_size } = log_result { assert!(now_size >= self.log_offset); + // update first log_offset of slot + let inst = &mut self.insts[slot - self.start_slot]; + if inst.log_offset == 0 { + inst.log_offset = self.log_offset; + } + assert!(inst.log_offset <= self.log_offset); + // then update self.log_offset self.log_offset = now_size; } else { return logged_err!(self.id; "unexpected log result type: {:?}", log_result); @@ -712,7 +722,10 @@ impl MultiPaxosReplica { let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if (inst.status != Status::Preparing) || (ballot < inst.bal) { + if !self.is_leader + || (inst.status != Status::Preparing) + || (ballot < inst.bal) + { return Ok(()); } assert_eq!(inst.bal, ballot); @@ -842,7 +855,10 @@ impl MultiPaxosReplica { let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious 
duplications and outdated replies - if (inst.status != Status::Accepting) || (ballot < inst.bal) { + if !self.is_leader + || (inst.status != Status::Accepting) + || (ballot < inst.bal) + { return Ok(()); } assert_eq!(inst.bal, ballot); @@ -1039,6 +1055,11 @@ impl MultiPaxosReplica { if inst.status < Status::Committed { inst.bal = self.bal_prep_sent; inst.status = Status::Preparing; + inst.leader_bk = Some(LeaderBookkeeping { + prepare_acks: Bitmap::new(self.population, false), + prepare_max_bal: 0, + accept_acks: Bitmap::new(self.population, false), + }); pf_debug!(self.id; "enter Prepare phase for slot {} bal {}", slot, inst.bal); @@ -1132,6 +1153,8 @@ impl MultiPaxosReplica { &mut self, durable: bool, ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got restart req"); + // send leave notification to peers and wait for their replies self.transport_hub.leave().await?; @@ -1273,6 +1296,11 @@ impl MultiPaxosReplica { inst.status = Status::Accepting; inst.reqs = reqs.clone(); inst.voted = (ballot, reqs); + // it could be the case that the PrepareBal action for this + // ballot has been snapshotted + if self.bal_prep_sent < ballot { + self.bal_prep_sent = ballot; + } // update bal_prepared and bal_max_seen if self.bal_prepared < ballot { self.bal_prepared = ballot; @@ -1367,45 +1395,54 @@ impl MultiPaxosReplica { } /// Dump a new key-value pair to snapshot file. - async fn snapshot_dump_kv_pair( - &mut self, - key: String, - value: String, - ) -> Result<(), SummersetError> { + async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { + // collect all key-value pairs put up to exec_bar + let mut pairs = HashMap::new(); + for slot in self.start_slot..self.exec_bar { + let inst = &self.insts[slot - self.start_slot]; + for (_, req) in inst.reqs.clone() { + if let ApiRequest::Req { + cmd: Command::Put { key, value }, + .. + } = req + { + pairs.insert(key, value); + } + } + } + + // write the collection to snapshot file self.snapshot_hub.submit_action( 0, // using 0 as dummy log action ID LogAction::Append { - entry: SnapEntry::NewKVPair { key, value }, + entry: SnapEntry::KVPairSet { pairs }, sync: self.config.logger_sync, }, )?; let (_, log_result) = self.snapshot_hub.get_result().await?; - if let LogResult::Write { - offset_ok: true, - now_size, - } = log_result - { + if let LogResult::Append { now_size } = log_result { self.snap_offset = now_size; Ok(()) } else { logged_err!( self.id; - "unexpected log result type or failed write" + "unexpected log result type" ) } } - /// Squash the durable WAL log, discarding everything older than start_slot. - async fn snapshot_squash_log(&mut self) -> Result<(), SummersetError> { - // read entries until one >= start_slot found - let mut cut_offset = 0; - loop { - self.storage_hub.submit_action( - 0, // using 0 as dummy log action ID - LogAction::Read { offset: cut_offset }, - )?; + /// Discard everything older than start_slot in durable WAL log. 
+ async fn snapshot_discard_log(&mut self) -> Result<(), SummersetError> { + let cut_offset = if !self.insts.is_empty() { + self.insts[0].log_offset + } else { + self.log_offset + }; - let mut found = false; + // discard the log before cut_offset + if cut_offset > 0 { + self.storage_hub + .submit_action(0, LogAction::Discard { offset: cut_offset })?; loop { let (action_id, log_result) = self.storage_hub.get_result().await?; @@ -1413,98 +1450,70 @@ impl MultiPaxosReplica { // normal log action previously in queue; process it self.handle_log_result(action_id, log_result)?; } else { - match log_result { - LogResult::Read { - entry: Some(entry), - end_offset, - } => { - let slot = match entry { - LogEntry::PrepareBal { slot, .. } => slot, - LogEntry::AcceptData { slot, .. } => slot, - LogEntry::CommitSlot { slot } => slot, - }; - if slot >= self.start_slot { - // first entry >= start_slot found - found = true; - } else { - // not found yet - cut_offset = end_offset; - } - } - LogResult::Read { entry: None, .. } => { - // end of WAL log - found = true; - } - _ => { - return logged_err!(self.id; "unexpected log result type"); - } + if let LogResult::Discard { + offset_ok: true, + now_size, + } = log_result + { + assert_eq!(self.log_offset - cut_offset, now_size); + self.log_offset = now_size; + } else { + return logged_err!( + self.id; + "unexpected log result type or failed discard" + ); } break; } } - - if found { - break; - } } - // discard the log before cut_offset - if cut_offset > 0 { - self.storage_hub - .submit_action(0, LogAction::Discard { offset: cut_offset })?; - let (_, log_result) = self.storage_hub.get_result().await?; - if let LogResult::Discard { - offset_ok: true, - now_size, - } = log_result - { - assert_eq!(self.log_offset - cut_offset, now_size); - self.log_offset = now_size; - } else { - return logged_err!( - self.id; - "unexpected log result type or failed discard" - ); + // update inst.log_offset for all remaining in-mem instances + for inst in &mut self.insts { + if inst.log_offset > 0 { + assert!(inst.log_offset >= cut_offset); + inst.log_offset -= cut_offset; } } + Ok(()) } /// Take a snapshot up to current exec_idx, then discard the in-mem log up - /// to that index, and squash the durable WAL log file. + /// to that index as well as outdate entries in the durable WAL log file. /// /// NOTE: the current implementation does not guard against crashes in the /// middle of taking a snapshot. async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { - pf_debug!(self.id; "taking a new snapshot: start {} exec {}", + pf_debug!(self.id; "taking new snapshot: start {} exec {}", self.start_slot, self.exec_bar); assert!(self.exec_bar >= self.start_slot); if self.exec_bar == self.start_slot { return Ok(()); } - // dump all Puts in executed instances - for slot in self.start_slot..self.exec_bar { - let inst = &self.insts[slot - self.start_slot]; - for (_, req) in inst.reqs.clone() { - if let ApiRequest::Req { - cmd: Command::Put { key, value }, - .. 
- } = req - { - self.snapshot_dump_kv_pair(key, value).await?; - } - } + // collect and dump all Puts in executed instances + if self.is_leader { + // NOTE: broadcast heartbeats here to appease followers + self.bcast_heartbeats()?; } + self.snapshot_dump_kv_pairs().await?; // update start_slot and discard all in-memory log instances up to exec_bar self.insts.drain(0..(self.exec_bar - self.start_slot)); self.start_slot = self.exec_bar; - // squash the durable WAL log, discarding everything older than start_slot - self.snapshot_squash_log().await?; + // discarding everything older than start_slot in WAL log + if self.is_leader { + // NOTE: broadcast heartbeats here to appease followers + self.bcast_heartbeats()?; + } + self.snapshot_discard_log().await?; - pf_debug!(self.id; "took new snapshot up to: start {}", self.start_slot); + // reset the leader heartbeat hear timer + self.kickoff_hb_hear_timer()?; + + pf_info!(self.id; "took snapshot up to: start {}", self.start_slot); Ok(()) } @@ -1539,13 +1548,17 @@ impl MultiPaxosReplica { match log_result { LogResult::Read { - entry: Some(SnapEntry::NewKVPair { key, value }), + entry: Some(SnapEntry::KVPairSet { pairs }), end_offset, } => { - // execute a Put command on state machine - self.state_machine - .submit_cmd(0, Command::Put { key, value })?; - let _ = self.state_machine.get_result().await?; + // execute Put commands on state machine + for (key, value) in pairs { + self.state_machine.submit_cmd( + 0, + Command::Put { key, value }, + )?; + let _ = self.state_machine.get_result().await?; + } // update snapshot file offset self.snap_offset = end_offset; } @@ -1615,6 +1628,7 @@ impl GenericReplica for MultiPaxosReplica { backer_path, logger_sync, hb_hear_timeout_min, hb_hear_timeout_max, hb_send_interval_ms, + snapshot_path, snapshot_interval_s, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; if config.batch_interval_us == 0 { @@ -1719,14 +1733,14 @@ impl GenericReplica for MultiPaxosReplica { time::interval(Duration::from_millis(config.hb_send_interval_ms)); hb_send_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); - let snapshot_interval = if config.snapshot_interval_s == 0 { - None - } else { - let mut si = - time::interval(Duration::from_secs(config.snapshot_interval_s)); - si.set_missed_tick_behavior(MissedTickBehavior::Skip); - Some(si) - }; + let mut snapshot_interval = time::interval(Duration::from_secs( + if config.snapshot_interval_s > 0 { + config.snapshot_interval_s + } else { + 60 // dummy non-zero value to make `time::interval` happy + }, + )); + snapshot_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); Ok(MultiPaxosReplica { id, @@ -1838,8 +1852,8 @@ impl GenericReplica for MultiPaxosReplica { }, // autonomous snapshot taking timeout - _ = self.snapshot_interval.as_mut().unwrap().tick(), if !paused - && self.snapshot_interval.is_some() => { + _ = self.snapshot_interval.tick(), if !paused + && self.config.snapshot_interval_s > 0 => { if let Err(e) = self.take_new_snapshot().await { pf_error!(self.id; "error taking a new snapshot: {}", e); } else { @@ -1859,10 +1873,6 @@ impl GenericReplica for MultiPaxosReplica { match self.handle_ctrl_msg(ctrl_msg, &mut paused).await { Ok(terminate) => { if let Some(restart) = terminate { - pf_warn!( - self.id; - "server got {} req", - if restart { "restart" } else { "shutdown" }); return Ok(restart); } }, diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index 240a18ff..378256b7 100644 --- 
a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -29,10 +29,12 @@ lazy_static! { ("client_reconnect", true), ("non_leader_reset", true), ("leader_node_reset", true), - ("two_nodes_reset", true), + ("two_nodes_reset", false), + ("all_nodes_reset", false), ("non_leader_pause", false), ("leader_node_pause", false), ("node_pause_resume", false), + ("snapshot_reset", false), ]; } @@ -308,6 +310,28 @@ impl ClientTester { } } + /// Force some server(s) to take a new snapshot. + async fn force_snapshot( + &mut self, + servers: HashSet, + ) -> Result<(), SummersetError> { + let ctrl_stub = self.driver.ctrl_stub(); + + // send TakeSnapshot request to manager + let req = CtrlRequest::TakeSnapshot { servers }; + let mut sent = ctrl_stub.send_req(Some(&req))?; + while !sent { + sent = ctrl_stub.send_req(None)?; + } + + // wat for reply from manager + let reply = ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::TakeSnapshot { .. } => Ok(()), + _ => logged_err!(self.driver.id; "unexpected control reply type"), + } + } + /// Resume some server(s) in the cluster. #[allow(dead_code)] async fn resume_servers( @@ -347,9 +371,11 @@ impl ClientTester { "non_leader_reset" => self.test_non_leader_reset().await, "leader_node_reset" => self.test_leader_node_reset().await, "two_nodes_reset" => self.test_two_nodes_reset().await, + "all_nodes_reset" => self.test_all_nodes_reset().await, "non_leader_pause" => self.test_non_leader_pause().await, "leader_node_pause" => self.test_leader_node_pause().await, "node_pause_resume" => self.test_node_pause_resume().await, + "snapshot_reset" => self.test_snapshot_reset().await, _ => { return logged_err!(self.driver.id; "unrecognized test name '{}'", name); @@ -443,7 +469,7 @@ impl ClientTester { if !is_leader { self.driver.leave(false).await?; self.reset_servers(HashSet::from([s]), true).await?; - time::sleep(Duration::from_millis(500)).await; + time::sleep(Duration::from_secs(1)).await; self.driver.connect().await?; self.checked_get("Jose", Some(Some(&v)), 0).await?; break; @@ -460,7 +486,7 @@ impl ClientTester { if is_leader { self.driver.leave(false).await?; self.reset_servers(HashSet::from([s]), true).await?; - time::sleep(Duration::from_millis(500)).await; + time::sleep(Duration::from_secs(1)).await; self.driver.connect().await?; self.checked_get("Jose", Some(Some(&v)), 0).await?; break; @@ -491,18 +517,30 @@ impl ClientTester { if resets.len() == 2 { self.driver.leave(false).await?; self.reset_servers(resets, true).await?; - time::sleep(Duration::from_millis(500)).await; + time::sleep(Duration::from_secs(1)).await; self.driver.connect().await?; self.checked_get("Jose", Some(Some(&v)), 0).await?; } Ok(()) } + /// All replica nodes crash and restart at the same time. + async fn test_all_nodes_reset(&mut self) -> Result<(), SummersetError> { + let v = Self::gen_rand_string(8); + self.checked_put("Jose", &v, Some(None), 0).await?; + self.driver.leave(false).await?; + self.reset_servers(HashSet::new(), true).await?; + time::sleep(Duration::from_secs(1)).await; + self.driver.connect().await?; + self.checked_get("Jose", Some(Some(&v)), 0).await?; + Ok(()) + } + /// Single non-leader replica node paused. async fn test_non_leader_pause(&mut self) -> Result<(), SummersetError> { let v0 = Self::gen_rand_string(8); self.checked_put("Jose", &v0, Some(None), 0).await?; - time::sleep(Duration::from_millis(300)).await; + time::sleep(Duration::from_millis(500)).await; for (s, is_leader) in self.query_servers().await? 
{ if !is_leader { self.driver.leave(false).await?; @@ -522,7 +560,7 @@ impl ClientTester { async fn test_leader_node_pause(&mut self) -> Result<(), SummersetError> { let v0 = Self::gen_rand_string(8); self.checked_put("Jose", &v0, Some(None), 0).await?; - time::sleep(Duration::from_millis(300)).await; + time::sleep(Duration::from_millis(500)).await; for (s, is_leader) in self.query_servers().await? { if is_leader { self.driver.leave(false).await?; @@ -542,7 +580,7 @@ impl ClientTester { async fn test_node_pause_resume(&mut self) -> Result<(), SummersetError> { let v0 = Self::gen_rand_string(8); self.checked_put("Jose", &v0, Some(None), 0).await?; - time::sleep(Duration::from_millis(300)).await; + time::sleep(Duration::from_millis(500)).await; for (s, is_leader) in self.query_servers().await? { if is_leader { self.driver.leave(false).await?; @@ -574,4 +612,22 @@ impl ClientTester { } Ok(()) } + + /// Take snapshot and reset, check previously put key-value. + async fn test_snapshot_reset(&mut self) -> Result<(), SummersetError> { + let v0 = Self::gen_rand_string(8); + self.checked_put("Jose", &v0, Some(None), 0).await?; + let v1 = Self::gen_rand_string(8); + self.checked_put("Shawn", &v1, Some(None), 0).await?; + time::sleep(Duration::from_millis(500)).await; + self.force_snapshot(HashSet::new()).await?; + self.checked_put("Jose", &v1, Some(Some(&v0)), 0).await?; + self.driver.leave(false).await?; + self.reset_servers(HashSet::new(), true).await?; + time::sleep(Duration::from_secs(1)).await; + self.driver.connect().await?; + self.checked_get("Shawn", Some(Some(&v1)), 0).await?; + self.checked_get("Jose", Some(Some(&v1)), 0).await?; + Ok(()) + } } From 79d4e4b9e614f537c1913a59985864dbcfebb5a5 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 24 Sep 2023 17:40:22 -0500 Subject: [PATCH 56/89] finished snapshotting impl for RSPaxos --- README.md | 5 +- src/protocols/crossword.rs | 4 +- src/protocols/multipaxos.rs | 2 +- src/protocols/rs_paxos.rs | 582 +++++++++++++++++++++++++++++------- 4 files changed, 483 insertions(+), 110 deletions(-) diff --git a/README.md b/README.md index a7250245..6a6224ce 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,8 @@ git push origin [![Format check](https://github.com/josehu07/summerset/actions/workflows/format.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Aformat) [![Build status](https://github.com/josehu07/summerset/actions/workflows/build.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Abuild) -[![Tests status](https://github.com/josehu07/summerset/actions/workflows/tests.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Atests) +[![Unit tests status](https://github.com/josehu07/summerset/actions/workflows/tests_unit.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Atests_unit) +[![Proc tests status](https://github.com/josehu07/summerset/actions/workflows/tests_proc.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Atests_proc) [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) Summerset is a distributed key-value store supporting a wide range of state machine replication (SMR) protocols for research purposes. More protocols are actively being added. 
@@ -150,7 +151,7 @@ Complete cluster management and benchmarking scripts are available in another re - [x] client-side timeout/retry logic - [x] state persistence & restart check - [x] automatic leader election, backoffs - - [ ] snapshotting & garbage collection + - [x] snapshotting & garbage collection - [ ] specialize read-only commands? - [ ] separate commit vs. exec responses? - [ ] membership discovery & view changes diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 424b0cc8..a98bbacb 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -128,7 +128,7 @@ struct ReplicaBookkeeping { source: ReplicaId, } -/// In-memory instance containing a complete commands batch. +/// In-memory instance containing a (possibly partial) commands batch. #[derive(Debug, Clone)] struct Instance { /// Ballot number. @@ -2013,7 +2013,7 @@ impl GenericEndpoint for CrosswordClient { } while self.ctrl_stub.recv_reply().await? != CtrlReply::Leave {} - pf_info!(self.id; "left current manager connection"); + pf_info!(self.id; "left manager connection"); } Ok(()) diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index a6123a8a..d9d5cd76 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -638,7 +638,7 @@ impl MultiPaxosReplica { assert!(now_size >= self.log_offset); // update first log_offset of slot let inst = &mut self.insts[slot - self.start_slot]; - if inst.log_offset == 0 { + if inst.log_offset == 0 || inst.log_offset > self.log_offset { inst.log_offset = self.log_offset; } assert!(inst.log_offset <= self.log_offset); diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 6df73631..68e09acf 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -10,9 +10,9 @@ use std::net::SocketAddr; use crate::utils::{SummersetError, Bitmap, Timer, RSCodeword}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ - ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, - ApiRequest, ApiReply, StorageHub, LogAction, LogResult, LogActionId, - TransportHub, GenericReplica, + ReplicaId, ControlHub, StateMachine, Command, CommandResult, CommandId, + ExternalApi, ApiRequest, ApiReply, StorageHub, LogAction, LogResult, + LogActionId, TransportHub, GenericReplica, }; use crate::client::{ClientId, ClientApiStub, ClientCtrlStub, GenericEndpoint}; use crate::protocols::SmrProtocol; @@ -39,7 +39,7 @@ pub struct ReplicaConfigRSPaxos { /// Client request batching maximum batch size. pub max_batch_size: usize, - /// Path to backing file. + /// Path to backing log file. pub backer_path: String, /// Whether to call `fsync()`/`fdatasync()` on logger. @@ -54,6 +54,13 @@ pub struct ReplicaConfigRSPaxos { /// Interval of leader sending heartbeats to followers. pub hb_send_interval_ms: u64, + /// Path to snapshot file. + pub snapshot_path: String, + + /// Snapshot self-triggering interval in secs. 0 means never trigger + /// snapshotting autonomously. + pub snapshot_interval_s: u64, + /// Fault-tolerance level. 
pub fault_tolerance: u8, @@ -72,9 +79,11 @@ impl Default for ReplicaConfigRSPaxos { max_batch_size: 5000, backer_path: "/tmp/summerset.rs_paxos.wal".into(), logger_sync: false, - hb_hear_timeout_min: 300, - hb_hear_timeout_max: 600, + hb_hear_timeout_min: 600, + hb_hear_timeout_max: 900, hb_send_interval_ms: 50, + snapshot_path: "/tmp/summerset.rs_paxos.snap".into(), + snapshot_interval_s: 0, fault_tolerance: 0, perf_storage_a: 0, perf_storage_b: 0, @@ -122,7 +131,7 @@ struct ReplicaBookkeeping { source: ReplicaId, } -/// In-memory instance containing a complete commands batch. +/// In-memory instance containing a (possibly partial) commands batch. #[derive(Debug, Clone)] struct Instance { /// Ballot number. @@ -145,6 +154,9 @@ struct Instance { /// True if from external client, else false. external: bool, + + /// Offset of first durable WAL log entry related to this instance. + log_offset: usize, } /// Stable storage log entry type. @@ -164,6 +176,17 @@ enum LogEntry { CommitSlot { slot: usize }, } +/// Snapshot file entry type. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] +enum SnapEntry { + /// First entry at the start of file: number of log instances covered by + /// this snapshot file == the start slot index of in-mem log. + StartSlot { slot: usize }, + + /// Set of key-value pairs to apply to the state. + KVPairSet { pairs: HashMap }, +} + /// Peer-peer message type. #[derive(Debug, Clone, Serialize, Deserialize, GetSize)] enum PeerMsg { @@ -192,11 +215,11 @@ enum PeerMsg { /// Commit notification from leader to replicas. Commit { slot: usize }, - /// Recovery read from new leader to replicas. - Recover { slot: usize }, + /// Reconstruction read from new leader to replicas. + Reconstruct { slot: usize }, - /// Recovery read reply from replica to leader. - RecoverReply { + /// Reconstruction read reply from replica to leader. + ReconstructReply { slot: usize, ballot: Ballot, reqs_cw: RSCodeword, @@ -238,6 +261,9 @@ pub struct RSPaxosReplica { /// StorageHub module. storage_hub: StorageHub, + /// StorageHub module for the snapshot file. + snapshot_hub: StorageHub, + /// TransportHub module. transport_hub: TransportHub, @@ -253,6 +279,12 @@ pub struct RSPaxosReplica { /// In-memory log of instances. insts: Vec, + /// Start slot index of in-mem log after latest snapshot. + start_slot: usize, + + /// Timer for taking a new autonomous snapshot. + snapshot_interval: Interval, + /// Largest ballot number that a leader has sent Prepare messages in. bal_prep_sent: Ballot, @@ -266,12 +298,15 @@ pub struct RSPaxosReplica { commit_bar: usize, /// Index of the first non-executed instance. - /// It is always true that exec_bar <= commit_bar <= insts.len() + /// It is always true that exec_bar <= commit_bar <= start_slot + insts.len() exec_bar: usize, /// Current durable log file offset. log_offset: usize, + /// Current durable snapshot file offset. + snap_offset: usize, + /// Fixed Reed-Solomon coder. 
rs_coder: ReedSolomon, } @@ -296,6 +331,7 @@ impl RSPaxosReplica { leader_bk: None, replica_bk: None, external: false, + log_offset: 0, }) } @@ -388,15 +424,15 @@ impl RSPaxosReplica { // create a new instance in the first null slot (or append a new one // at the end if no holes exist) - let mut slot = self.insts.len(); - for s in self.commit_bar..self.insts.len() { - if self.insts[s].status == Status::Null { + let mut slot = self.start_slot + self.insts.len(); + for s in self.commit_bar..(self.start_slot + self.insts.len()) { + if self.insts[s - self.start_slot].status == Status::Null { slot = s; break; } } - if slot < self.insts.len() { - let old_inst = &mut self.insts[slot]; + if slot < self.start_slot + self.insts.len() { + let old_inst = &mut self.insts[slot - self.start_slot]; assert_eq!(old_inst.status, Status::Null); old_inst.reqs_cw = reqs_cw; old_inst.leader_bk = Some(LeaderBookkeeping { @@ -427,7 +463,7 @@ impl RSPaxosReplica { self.bal_max_seen = self.bal_prep_sent; } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = self.bal_prep_sent; inst.status = Status::Preparing; pf_debug!(self.id; "enter Prepare phase for slot {} bal {}", @@ -459,7 +495,7 @@ impl RSPaxosReplica { slot, inst.bal); } else { // normal case: Prepare phase covered, only do the Accept phase - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = self.bal_prepared; inst.status = Status::Accepting; pf_debug!(self.id; "enter Accept phase for slot {} bal {}", @@ -515,9 +551,12 @@ impl RSPaxosReplica { &mut self, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "finished PrepareBal logging for slot {} bal {}", - slot, self.insts[slot].bal); - let inst = &self.insts[slot]; + slot, self.insts[slot - self.start_slot].bal); + let inst = &self.insts[slot - self.start_slot]; let voted = if inst.voted.0 > 0 { Some(inst.voted.clone()) } else { @@ -532,18 +571,18 @@ impl RSPaxosReplica { } else { // on follower replica, finishing the logging of a // PrepareBal entry leads to sending back a Prepare reply - assert!(inst.replica_bk.is_some()); - let source = inst.replica_bk.as_ref().unwrap().source; - self.transport_hub.send_msg( - PeerMsg::PrepareReply { - slot, - ballot: inst.bal, - voted, - }, - source, - )?; - pf_trace!(self.id; "sent PrepareReply -> {} for slot {} bal {}", - source, slot, inst.bal); + if let Some(ReplicaBookkeeping { source }) = inst.replica_bk { + self.transport_hub.send_msg( + PeerMsg::PrepareReply { + slot, + ballot: inst.bal, + voted, + }, + source, + )?; + pf_trace!(self.id; "sent PrepareReply -> {} for slot {} bal {}", + source, slot, inst.bal); + } } Ok(()) @@ -554,9 +593,12 @@ impl RSPaxosReplica { &mut self, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "finished AcceptData logging for slot {} bal {}", - slot, self.insts[slot].bal); - let inst = &self.insts[slot]; + slot, self.insts[slot - self.start_slot].bal); + let inst = &self.insts[slot - self.start_slot]; if self.is_leader { // on leader, finishing the logging of an AcceptData entry @@ -566,17 +608,17 @@ impl RSPaxosReplica { } else { // on follower replica, finishing the logging of an // AcceptData entry leads to sending back an Accept reply - assert!(inst.replica_bk.is_some()); - let source = inst.replica_bk.as_ref().unwrap().source; - 
self.transport_hub.send_msg( - PeerMsg::AcceptReply { - slot, - ballot: inst.bal, - }, - source, - )?; - pf_trace!(self.id; "sent AcceptReply -> {} for slot {} bal {}", - source, slot, inst.bal); + if let Some(ReplicaBookkeeping { source }) = inst.replica_bk { + self.transport_hub.send_msg( + PeerMsg::AcceptReply { + slot, + ballot: inst.bal, + }, + source, + )?; + pf_trace!(self.id; "sent AcceptReply -> {} for slot {} bal {}", + source, slot, inst.bal); + } } Ok(()) @@ -587,14 +629,16 @@ impl RSPaxosReplica { &mut self, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "finished CommitSlot logging for slot {} bal {}", - slot, self.insts[slot].bal); - assert!(self.insts[slot].status >= Status::Committed); + slot, self.insts[slot - self.start_slot].bal); // update index of the first non-committed instance if slot == self.commit_bar { - while self.commit_bar < self.insts.len() { - let inst = &mut self.insts[self.commit_bar]; + while self.commit_bar < self.start_slot + self.insts.len() { + let inst = &mut self.insts[self.commit_bar - self.start_slot]; if inst.status < Status::Committed { break; } @@ -643,10 +687,20 @@ impl RSPaxosReplica { log_result: LogResult, ) -> Result<(), SummersetError> { let (slot, entry_type) = Self::split_log_action_id(action_id); - assert!(slot < self.insts.len()); + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } + assert!(slot < self.start_slot + self.insts.len()); if let LogResult::Append { now_size } = log_result { assert!(now_size >= self.log_offset); + // update first log_offset of slot + let inst = &mut self.insts[slot - self.start_slot]; + if inst.log_offset == 0 || inst.log_offset > self.log_offset { + inst.log_offset = self.log_offset; + } + assert!(inst.log_offset <= self.log_offset); + // then update self.log_offset self.log_offset = now_size; } else { return logged_err!(self.id; "unexpected log result type: {:?}", log_result); @@ -669,16 +723,19 @@ impl RSPaxosReplica { slot: usize, ballot: Ballot, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received Prepare <- {} for slot {} bal {}", peer, slot, ballot); // if ballot is not smaller than what I have seen: if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; assert!(inst.bal <= ballot); inst.bal = ballot; @@ -711,17 +768,23 @@ impl RSPaxosReplica { ballot: Ballot, voted: Option<(Ballot, RSCodeword)>, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received PrepareReply <- {} for slot {} bal {} shards {:?}", peer, slot, ballot, voted.as_ref().map(|(_, cw)| cw.avail_shards_map())); // if ballot is what I'm currently waiting on for Prepare replies: if ballot == self.bal_prep_sent { - assert!(slot < self.insts.len()); - let inst = &mut self.insts[slot]; + assert!(slot < self.start_slot + self.insts.len()); + let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if (inst.status != Status::Preparing) || (ballot < inst.bal) { + if !self.is_leader + || (inst.status != Status::Preparing) 
+ || (ballot < inst.bal) + { return Ok(()); } assert_eq!(inst.bal, ballot); @@ -826,16 +889,19 @@ impl RSPaxosReplica { ballot: Ballot, reqs_cw: RSCodeword, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received Accept <- {} for slot {} bal {} shards {:?}", peer, slot, ballot, reqs_cw.avail_shards_map()); // if ballot is not smaller than what I have made promises for: if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; assert!(inst.bal <= ballot); inst.bal = ballot; @@ -873,16 +939,22 @@ impl RSPaxosReplica { slot: usize, ballot: Ballot, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received AcceptReply <- {} for slot {} bal {}", peer, slot, ballot); // if ballot is what I'm currently waiting on for Accept replies: if ballot == self.bal_prepared { - assert!(slot < self.insts.len()); - let inst = &mut self.insts[slot]; + assert!(slot < self.start_slot + self.insts.len()); + let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if (inst.status != Status::Accepting) || (ballot < inst.bal) { + if !self.is_leader + || (inst.status != Status::Accepting) + || (ballot < inst.bal) + { return Ok(()); } assert_eq!(inst.bal, ballot); @@ -934,13 +1006,16 @@ impl RSPaxosReplica { peer: ReplicaId, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received Commit <- {} for slot {}", peer, slot); // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications if inst.status != Status::Accepting { @@ -966,19 +1041,22 @@ impl RSPaxosReplica { Ok(()) } - /// Handler of Recover message from leader. - fn handle_msg_recover( + /// Handler of Reconstruct message from leader. 
+ fn handle_msg_reconstruct( &mut self, peer: ReplicaId, slot: usize, ) -> Result<(), SummersetError> { - pf_trace!(self.id; "received Recover <- {} for slot {}", peer, slot); + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } + pf_trace!(self.id; "received Reconstruct <- {} for slot {}", peer, slot); // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications; also ignore if I have nothing to send back if inst.status < Status::Accepting || inst.reqs_cw.avail_shards() == 0 { @@ -987,32 +1065,35 @@ impl RSPaxosReplica { // send back my ballot for this slot and the available shards self.transport_hub.send_msg( - PeerMsg::RecoverReply { + PeerMsg::ReconstructReply { slot, ballot: inst.bal, reqs_cw: inst.reqs_cw.clone(), }, peer, )?; - pf_trace!(self.id; "sent RecoverReply message for slot {} bal {}", slot, inst.bal); + pf_trace!(self.id; "sent ReconstructReply message for slot {} bal {}", slot, inst.bal); Ok(()) } - /// Handler of Recover reply from replica. - fn handle_msg_recover_reply( + /// Handler of Reconstruct reply from replica. + fn handle_msg_reconstruct_reply( &mut self, peer: ReplicaId, slot: usize, ballot: Ballot, reqs_cw: RSCodeword, ) -> Result<(), SummersetError> { - pf_trace!(self.id; "received RecoverReply <- {} for slot {} bal {} shards {:?}", + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } + pf_trace!(self.id; "received ReconstructReply <- {} for slot {} bal {} shards {:?}", peer, slot, ballot, reqs_cw.avail_shards_map()); - assert!(slot < self.insts.len()); - assert!(self.insts[slot].status >= Status::Committed); - let num_insts = self.insts.len(); - let inst = &mut self.insts[slot]; + assert!(slot < self.start_slot + self.insts.len()); + assert!(self.insts[slot - self.start_slot].status >= Status::Committed); + let num_insts = self.start_slot + self.insts.len(); + let inst = &mut self.insts[slot - self.start_slot]; // if reply not outdated and ballot is up-to-date if inst.status < Status::Executed && ballot >= inst.bal { @@ -1022,7 +1103,8 @@ impl RSPaxosReplica { // if enough shards have been gathered, can push execution forward if slot == self.commit_bar { while self.commit_bar < num_insts { - let inst = &mut self.insts[self.commit_bar]; + let inst = + &mut self.insts[self.commit_bar - self.start_slot]; if inst.status < Status::Committed || inst.reqs_cw.avail_shards() < self.quorum_cnt { @@ -1089,12 +1171,14 @@ impl RSPaxosReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), - PeerMsg::Recover { slot } => self.handle_msg_recover(peer, slot), - PeerMsg::RecoverReply { + PeerMsg::Reconstruct { slot } => { + self.handle_msg_reconstruct(peer, slot) + } + PeerMsg::ReconstructReply { slot, ballot, reqs_cw, - } => self.handle_msg_recover_reply(peer, slot, ballot, reqs_cw), + } => self.handle_msg_reconstruct_reply(peer, slot, ballot, reqs_cw), PeerMsg::Heartbeat { ballot } => self.heard_heartbeat(peer, ballot), } } @@ -1106,11 +1190,14 @@ impl RSPaxosReplica { cmd_result: CommandResult, ) -> Result<(), SummersetError> { let (slot, cmd_idx) = Self::split_command_id(cmd_id); - assert!(slot < self.insts.len()); + if slot < self.start_slot { + return Ok(()); // ignore if slot index 
outdated + } + assert!(slot < self.start_slot + self.insts.len()); pf_trace!(self.id; "executed cmd in instance at slot {} idx {}", slot, cmd_idx); - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; let reqs = inst.reqs_cw.get_data()?; assert!(cmd_idx < reqs.len()); let (client, ref req) = reqs[cmd_idx]; @@ -1142,8 +1229,8 @@ impl RSPaxosReplica { // update index of the first non-executed instance if slot == self.exec_bar { - while self.exec_bar < self.insts.len() { - let inst = &mut self.insts[self.exec_bar]; + while self.exec_bar < self.start_slot + self.insts.len() { + let inst = &mut self.insts[self.exec_bar - self.start_slot]; if inst.status < Status::Executed { break; } @@ -1175,11 +1262,21 @@ impl RSPaxosReplica { self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen); self.bal_max_seen = self.bal_prep_sent; - for (slot, inst) in self.insts.iter_mut().enumerate() { + for (slot, inst) in self + .insts + .iter_mut() + .enumerate() + .map(|(s, i)| (self.start_slot + s, i)) + { // redo Prepare phase for all in-progress instances if inst.status < Status::Committed { inst.bal = self.bal_prep_sent; inst.status = Status::Preparing; + inst.leader_bk = Some(LeaderBookkeeping { + prepare_acks: Bitmap::new(self.population, false), + prepare_max_bal: 0, + accept_acks: Bitmap::new(self.population, false), + }); pf_debug!(self.id; "enter Prepare phase for slot {} bal {}", slot, inst.bal); @@ -1209,14 +1306,14 @@ impl RSPaxosReplica { slot, inst.bal); } - // do recovery reads for all committed instances that do not + // do reconstruction reads for all committed instances that do not // hold enough available shards for reconstruction if inst.status == Status::Committed && inst.reqs_cw.avail_shards() < self.quorum_cnt { self.transport_hub - .bcast_msg(PeerMsg::Recover { slot }, None)?; - pf_trace!(self.id; "broadcast Recover messages for slot {} bal {} shards {:?}", + .bcast_msg(PeerMsg::Reconstruct { slot }, None)?; + pf_trace!(self.id; "broadcast Reconstruct messages for slot {} bal {} shards {:?}", slot, inst.bal, inst.reqs_cw.avail_shards_map()); } } @@ -1284,6 +1381,8 @@ impl RSPaxosReplica { &mut self, durable: bool, ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got restart req"); + // send leave notification to peers and wait for their replies self.transport_hub.leave().await?; @@ -1343,6 +1442,19 @@ impl RSPaxosReplica { Ok(()) } + /// Handler of TakeSnapshot control message. + async fn handle_ctrl_take_snapshot( + &mut self, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server told to take snapshot"); + self.take_new_snapshot().await?; + + self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { + new_start: self.start_slot, + })?; + Ok(()) + } + /// Synthesized handler of manager control messages. If ok, returns /// `Some(true)` if decides to terminate and reboot, `Some(false)` if /// decides to shutdown completely, and `None` if not terminating. 
@@ -1367,6 +1479,11 @@ impl RSPaxosReplica { Ok(None) } + CtrlMsg::TakeSnapshot => { + self.handle_ctrl_take_snapshot().await?; + Ok(None) + } + _ => Ok(None), // ignore all other types } } @@ -1379,11 +1496,11 @@ impl RSPaxosReplica { match entry { LogEntry::PrepareBal { slot, ballot } => { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } // update instance state - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = ballot; inst.status = Status::Preparing; // update bal_prep_sent and bal_max_seen, reset bal_prepared @@ -1402,15 +1519,20 @@ impl RSPaxosReplica { reqs_cw, } => { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } // update instance state - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = ballot; inst.status = Status::Accepting; inst.reqs_cw = reqs_cw.clone(); inst.voted = (ballot, reqs_cw); + // it could be the case that the PrepareBal action for this + // ballot has been snapshotted + if self.bal_prep_sent < ballot { + self.bal_prep_sent = ballot; + } // update bal_prepared and bal_max_seen if self.bal_prepared < ballot { self.bal_prepared = ballot; @@ -1422,14 +1544,15 @@ impl RSPaxosReplica { } LogEntry::CommitSlot { slot } => { - assert!(slot < self.insts.len()); + assert!(slot < self.start_slot + self.insts.len()); // update instance state - self.insts[slot].status = Status::Committed; + self.insts[slot - self.start_slot].status = Status::Committed; // submit commands in contiguously committed instance to the // state machine if slot == self.commit_bar { - while self.commit_bar < self.insts.len() { - let inst = &mut self.insts[self.commit_bar]; + while self.commit_bar < self.start_slot + self.insts.len() { + let inst = + &mut self.insts[self.commit_bar - self.start_slot]; if inst.status < Status::Committed { break; } @@ -1513,6 +1636,221 @@ impl RSPaxosReplica { logged_err!(self.id; "unexpected log result type or failed truncate") } } + + /// Dump a new key-value pair to snapshot file. + async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { + // collect all key-value pairs put up to exec_bar + let mut pairs = HashMap::new(); + for slot in self.start_slot..self.exec_bar { + let inst = &mut self.insts[slot - self.start_slot]; + assert!(inst.reqs_cw.avail_data_shards() >= self.quorum_cnt); + for (_, req) in inst.reqs_cw.get_data()?.clone() { + if let ApiRequest::Req { + cmd: Command::Put { key, value }, + .. + } = req + { + pairs.insert(key, value); + } + } + } + + // write the collection to snapshot file + self.snapshot_hub.submit_action( + 0, // using 0 as dummy log action ID + LogAction::Append { + entry: SnapEntry::KVPairSet { pairs }, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + if let LogResult::Append { now_size } = log_result { + self.snap_offset = now_size; + Ok(()) + } else { + logged_err!( + self.id; + "unexpected log result type" + ) + } + } + + /// Discard everything older than start_slot in durable WAL log. 
+ async fn snapshot_discard_log(&mut self) -> Result<(), SummersetError> { + let cut_offset = if !self.insts.is_empty() { + self.insts[0].log_offset + } else { + self.log_offset + }; + + // discard the log before cut_offset + if cut_offset > 0 { + self.storage_hub + .submit_action(0, LogAction::Discard { offset: cut_offset })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id != 0 { + // normal log action previously in queue; process it + self.handle_log_result(action_id, log_result)?; + } else { + if let LogResult::Discard { + offset_ok: true, + now_size, + } = log_result + { + assert_eq!(self.log_offset - cut_offset, now_size); + self.log_offset = now_size; + } else { + return logged_err!( + self.id; + "unexpected log result type or failed discard" + ); + } + break; + } + } + } + + // update inst.log_offset for all remaining in-mem instances + for inst in &mut self.insts { + if inst.log_offset > 0 { + assert!(inst.log_offset >= cut_offset); + inst.log_offset -= cut_offset; + } + } + + Ok(()) + } + + /// Take a snapshot up to current exec_idx, then discard the in-mem log up + /// to that index as well as outdate entries in the durable WAL log file. + /// + /// NOTE: the current implementation does not guard against crashes in the + /// middle of taking a snapshot. + async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { + pf_debug!(self.id; "taking new snapshot: start {} exec {}", + self.start_slot, self.exec_bar); + assert!(self.exec_bar >= self.start_slot); + if self.exec_bar == self.start_slot { + return Ok(()); + } + + // collect and dump all Puts in executed instances + if self.is_leader { + // NOTE: broadcast heartbeats here to appease followers + self.bcast_heartbeats()?; + } + self.snapshot_dump_kv_pairs().await?; + + // update start_slot and discard all in-memory log instances up to exec_bar + self.insts.drain(0..(self.exec_bar - self.start_slot)); + self.start_slot = self.exec_bar; + + // discarding everything older than start_slot in WAL log + if self.is_leader { + // NOTE: broadcast heartbeats here to appease followers + self.bcast_heartbeats()?; + } + self.snapshot_discard_log().await?; + + // reset the leader heartbeat hear timer + self.kickoff_hb_hear_timer()?; + + pf_info!(self.id; "took snapshot up to: start {}", self.start_slot); + Ok(()) + } + + /// Recover initial state from durable storage snapshot file. 
+ async fn recover_from_snapshot(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.snap_offset, 0); + + // first, try to read the first several bytes, which should record the + // start_slot index + self.snapshot_hub + .submit_action(0, LogAction::Read { offset: 0 })?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(SnapEntry::StartSlot { slot }), + end_offset, + } => { + self.snap_offset = end_offset; + self.start_slot = slot; // get start slot index of in-mem log + + // repeatedly apply key-value pairs + loop { + self.snapshot_hub.submit_action( + 0, + LogAction::Read { + offset: self.snap_offset, + }, + )?; + let (_, log_result) = + self.snapshot_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(SnapEntry::KVPairSet { pairs }), + end_offset, + } => { + // execute Put commands on state machine + for (key, value) in pairs { + self.state_machine.submit_cmd( + 0, + Command::Put { key, value }, + )?; + let _ = self.state_machine.get_result().await?; + } + // update snapshot file offset + self.snap_offset = end_offset; + } + LogResult::Read { entry: None, .. } => { + // end of log reached + break; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + } + + // tell manager about my start_slot index + self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { + new_start: self.start_slot, + })?; + Ok(()) + } + + LogResult::Read { entry: None, .. } => { + // snapshot file is empty. Write a 0 as start_slot and return + self.snapshot_hub.submit_action( + 0, + LogAction::Write { + entry: SnapEntry::StartSlot { slot: 0 }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + if let LogResult::Write { + offset_ok: true, + now_size, + } = log_result + { + self.snap_offset = now_size; + Ok(()) + } else { + logged_err!(self.id; "unexpected log result type or failed truncate") + } + } + + _ => { + logged_err!(self.id; "unexpected log result type") + } + } + } } #[async_trait] @@ -1533,7 +1871,9 @@ impl GenericReplica for RSPaxosReplica { batch_interval_us, max_batch_size, backer_path, logger_sync, hb_hear_timeout_min, hb_hear_timeout_max, - hb_send_interval_ms, fault_tolerance, + hb_send_interval_ms, + snapshot_path, snapshot_interval_s, + fault_tolerance, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; if config.batch_interval_us == 0 { @@ -1629,6 +1969,14 @@ impl GenericReplica for RSPaxosReplica { } transport_hub.wait_for_group(population).await?; + // setup snapshot hub module + let snapshot_hub = StorageHub::new_and_setup( + id, + Path::new(&config.snapshot_path), + None, + ) + .await?; + // setup external API module, ready to take in client requests let external_api = ExternalApi::new_and_setup( id, @@ -1642,6 +1990,15 @@ impl GenericReplica for RSPaxosReplica { time::interval(Duration::from_millis(config.hb_send_interval_ms)); hb_send_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + let mut snapshot_interval = time::interval(Duration::from_secs( + if config.snapshot_interval_s > 0 { + config.snapshot_interval_s + } else { + 60 // dummy non-zero value to make `time::interval` happy + }, + )); + snapshot_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + Ok(RSPaxosReplica { id, population, @@ -1653,17 +2010,21 @@ impl GenericReplica for RSPaxosReplica { external_api, state_machine, storage_hub, + snapshot_hub, transport_hub, hb_hear_timer: Timer::new(), 
hb_send_interval, is_leader: false, insts: vec![], + start_slot: 0, + snapshot_interval, bal_prep_sent: 0, bal_prepared: 0, bal_max_seen: 0, commit_bar: 0, exec_bar: 0, log_offset: 0, + snap_offset: 0, rs_coder, }) } @@ -1672,7 +2033,10 @@ impl GenericReplica for RSPaxosReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { - // recover state from durable storage log + // recover state from durable snapshot file + self.recover_from_snapshot().await?; + + // recover the tail-piece memory log & state from durable storage log self.recover_from_log().await?; // kick off leader activity hearing timer @@ -1745,6 +2109,18 @@ impl GenericReplica for RSPaxosReplica { } }, + // autonomous snapshot taking timeout + _ = self.snapshot_interval.tick(), if !paused + && self.config.snapshot_interval_s > 0 => { + if let Err(e) = self.take_new_snapshot().await { + pf_error!(self.id; "error taking a new snapshot: {}", e); + } else { + self.control_hub.send_ctrl( + CtrlMsg::SnapshotUpTo { new_start: self.start_slot } + )?; + } + }, + // manager control message ctrl_msg = self.control_hub.recv_ctrl() => { if let Err(e) = ctrl_msg { @@ -1755,10 +2131,6 @@ impl GenericReplica for RSPaxosReplica { match self.handle_ctrl_msg(ctrl_msg, &mut paused).await { Ok(terminate) => { if let Some(restart) = terminate { - pf_warn!( - self.id; - "server got {} req", - if restart { "restart" } else { "shutdown" }); return Ok(restart); } }, @@ -1885,7 +2257,7 @@ impl GenericEndpoint for RSPaxosClient { } async fn leave(&mut self, permanent: bool) -> Result<(), SummersetError> { - // send leave notification to current connected server + // send leave notification to all servers for (id, mut api_stub) in self.api_stubs.drain() { let mut sent = api_stub.send_req(Some(&ApiRequest::Leave))?; while !sent { @@ -1906,7 +2278,7 @@ impl GenericEndpoint for RSPaxosClient { } while self.ctrl_stub.recv_reply().await? 
!= CtrlReply::Leave {} - pf_info!(self.id; "left current manager connection"); + pf_info!(self.id; "left manager connection"); } Ok(()) From c77fc742effd39ae938dc97c10c682eb81767e5b Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 24 Sep 2023 17:55:43 -0500 Subject: [PATCH 57/89] finished snapshotting impl for RSPaxos --- src/protocols/crossword.rs | 3 ++- src/protocols/rs_paxos.rs | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index a98bbacb..02f26853 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -1094,7 +1094,8 @@ impl CrosswordReplica { }, peer, )?; - pf_trace!(self.id; "sent RecoverReply message for slot {} bal {}", slot, inst.bal); + pf_trace!(self.id; "sent RecoverReply message for slot {} bal {}", + slot, inst.bal); Ok(()) } diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 68e09acf..4aa5a81f 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -1072,7 +1072,8 @@ impl RSPaxosReplica { }, peer, )?; - pf_trace!(self.id; "sent ReconstructReply message for slot {} bal {}", slot, inst.bal); + pf_trace!(self.id; "sent ReconstructReply message for slot {} bal {}", + slot, inst.bal); Ok(()) } From 693d26ad88622b99e821944baf95921feaa4bb43 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 24 Sep 2023 22:02:47 -0500 Subject: [PATCH 58/89] staging progress on snapshotting --- scripts/local_cluster.py | 2 + src/protocols/crossword.rs | 576 ++++++++++++++++++++++++++++++------- 2 files changed, 476 insertions(+), 102 deletions(-) diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index 6d23db83..a088ed81 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -50,6 +50,8 @@ def kill_all_matching(name, force=False): PROTOCOL_SNAPSHOT_PATH = { "MultiPaxos": lambda r: f"snapshot_path='/tmp/summerset.multipaxos.{r}.snap'", + "RSPaxos": lambda r: f"snapshot_path='/tmp/summerset.rs_paxos.{r}.snap'", + "Crossword": lambda r: f"snapshot_path='/tmp/summerset.crossword.{r}.snap'", } diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 02f26853..f8dda64d 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -1,7 +1,8 @@ //! Replication protocol: Crossword. //! //! MultiPaxos with flexible Reed-Solomon erasure coding that supports tunable -//! shard groups and asymmetric shard assignment. +//! shard groups, asymmetric shard assignment, and follower gossiping for actual +//! usability. use std::collections::HashMap; use std::path::Path; @@ -10,9 +11,9 @@ use std::net::SocketAddr; use crate::utils::{SummersetError, Bitmap, Timer, RSCodeword}; use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; use crate::server::{ - ReplicaId, ControlHub, StateMachine, CommandResult, CommandId, ExternalApi, - ApiRequest, ApiReply, StorageHub, LogAction, LogResult, LogActionId, - TransportHub, GenericReplica, + ReplicaId, ControlHub, StateMachine, Command, CommandResult, CommandId, + ExternalApi, ApiRequest, ApiReply, StorageHub, LogAction, LogResult, + LogActionId, TransportHub, GenericReplica, }; use crate::client::{ClientId, ClientApiStub, ClientCtrlStub, GenericEndpoint}; use crate::protocols::SmrProtocol; @@ -39,7 +40,7 @@ pub struct ReplicaConfigCrossword { /// Client request batching maximum batch size. pub max_batch_size: usize, - /// Path to backing file. + /// Path to backing log file. pub backer_path: String, /// Whether to call `fsync()`/`fdatasync()` on logger. 
@@ -54,6 +55,13 @@ pub struct ReplicaConfigCrossword { /// Interval of leader sending heartbeats to followers. pub hb_send_interval_ms: u64, + /// Path to snapshot file. + pub snapshot_path: String, + + /// Snapshot self-triggering interval in secs. 0 means never trigger + /// snapshotting autonomously. + pub snapshot_interval_s: u64, + /// Fault-tolerance level. pub fault_tolerance: u8, @@ -76,9 +84,11 @@ impl Default for ReplicaConfigCrossword { max_batch_size: 5000, backer_path: "/tmp/summerset.rs_paxos.wal".into(), logger_sync: false, - hb_hear_timeout_min: 300, - hb_hear_timeout_max: 600, + hb_hear_timeout_min: 600, + hb_hear_timeout_max: 900, hb_send_interval_ms: 50, + snapshot_path: "/tmp/summerset.rs_paxos.snap".into(), + snapshot_interval_s: 0, fault_tolerance: 0, shards_per_replica: 1, perf_storage_a: 0, @@ -151,6 +161,9 @@ struct Instance { /// True if from external client, else false. external: bool, + + /// Offset of first durable WAL log entry related to this instance. + log_offset: usize, } /// Stable storage log entry type. @@ -170,6 +183,17 @@ enum LogEntry { CommitSlot { slot: usize }, } +/// Snapshot file entry type. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] +enum SnapEntry { + /// First entry at the start of file: number of log instances covered by + /// this snapshot file == the start slot index of in-mem log. + StartSlot { slot: usize }, + + /// Set of key-value pairs to apply to the state. + KVPairSet { pairs: HashMap }, +} + /// Peer-peer message type. #[derive(Debug, Clone, Serialize, Deserialize, GetSize)] enum PeerMsg { @@ -198,11 +222,11 @@ enum PeerMsg { /// Commit notification from leader to replicas. Commit { slot: usize }, - /// Recovery read from new leader to replicas. - Recover { slot: usize }, + /// Reconstruction read from new leader to replicas. + Reconstruct { slot: usize }, - /// Recovery read reply from replica to leader. - RecoverReply { + /// Reconstruction read reply from replica to leader. + ReconstructReply { slot: usize, ballot: Ballot, reqs_cw: RSCodeword, @@ -244,6 +268,9 @@ pub struct CrosswordReplica { /// StorageHub module. storage_hub: StorageHub, + /// StorageHub module for the snapshot file. + snapshot_hub: StorageHub, + /// TransportHub module. transport_hub: TransportHub, @@ -259,6 +286,12 @@ pub struct CrosswordReplica { /// In-memory log of instances. insts: Vec, + /// Start slot index of in-mem log after latest snapshot. + start_slot: usize, + + /// Timer for taking a new autonomous snapshot. + snapshot_interval: Interval, + /// Largest ballot number that a leader has sent Prepare messages in. bal_prep_sent: Ballot, @@ -272,12 +305,15 @@ pub struct CrosswordReplica { commit_bar: usize, /// Index of the first non-executed instance. - /// It is always true that exec_bar <= commit_bar <= insts.len() + /// It is always true that exec_bar <= commit_bar <= start_slot + insts.len() exec_bar: usize, /// Current durable log file offset. log_offset: usize, + /// Current durable snapshot file offset. + snap_offset: usize, + /// Fixed Reed-Solomon coder. 
rs_coder: ReedSolomon, } @@ -302,6 +338,7 @@ impl CrosswordReplica { leader_bk: None, replica_bk: None, external: false, + log_offset: 0, }) } @@ -446,15 +483,15 @@ impl CrosswordReplica { // create a new instance in the first null slot (or append a new one // at the end if no holes exist) - let mut slot = self.insts.len(); - for s in self.commit_bar..self.insts.len() { - if self.insts[s].status == Status::Null { + let mut slot = self.start_slot + self.insts.len(); + for s in self.commit_bar..(self.start_slot + self.insts.len()) { + if self.insts[s - self.start_slot].status == Status::Null { slot = s; break; } } - if slot < self.insts.len() { - let old_inst = &mut self.insts[slot]; + if slot < self.start_slot + self.insts.len() { + let old_inst = &mut self.insts[slot - self.start_slot]; assert_eq!(old_inst.status, Status::Null); old_inst.reqs_cw = reqs_cw; old_inst.leader_bk = Some(LeaderBookkeeping { @@ -485,7 +522,7 @@ impl CrosswordReplica { self.bal_max_seen = self.bal_prep_sent; } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = self.bal_prep_sent; inst.status = Status::Preparing; pf_debug!(self.id; "enter Prepare phase for slot {} bal {}", @@ -517,7 +554,7 @@ impl CrosswordReplica { slot, inst.bal); } else { // normal case: Prepare phase covered, only do the Accept phase - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = self.bal_prepared; inst.status = Status::Accepting; pf_debug!(self.id; "enter Accept phase for slot {} bal {}", @@ -588,9 +625,12 @@ impl CrosswordReplica { &mut self, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "finished PrepareBal logging for slot {} bal {}", - slot, self.insts[slot].bal); - let inst = &self.insts[slot]; + slot, self.insts[slot - self.start_slot].bal); + let inst = &self.insts[slot - self.start_slot]; let voted = if inst.voted.0 > 0 { Some(inst.voted.clone()) } else { @@ -605,18 +645,18 @@ impl CrosswordReplica { } else { // on follower replica, finishing the logging of a // PrepareBal entry leads to sending back a Prepare reply - assert!(inst.replica_bk.is_some()); - let source = inst.replica_bk.as_ref().unwrap().source; - self.transport_hub.send_msg( - PeerMsg::PrepareReply { - slot, - ballot: inst.bal, - voted, - }, - source, - )?; - pf_trace!(self.id; "sent PrepareReply -> {} for slot {} bal {}", - source, slot, inst.bal); + if let Some(ReplicaBookkeeping { source }) = inst.replica_bk { + self.transport_hub.send_msg( + PeerMsg::PrepareReply { + slot, + ballot: inst.bal, + voted, + }, + source, + )?; + pf_trace!(self.id; "sent PrepareReply -> {} for slot {} bal {}", + source, slot, inst.bal); + } } Ok(()) @@ -627,9 +667,12 @@ impl CrosswordReplica { &mut self, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "finished AcceptData logging for slot {} bal {}", - slot, self.insts[slot].bal); - let inst = &self.insts[slot]; + slot, self.insts[slot - self.start_slot].bal); + let inst = &self.insts[slot - self.start_slot]; if self.is_leader { // on leader, finishing the logging of an AcceptData entry @@ -639,17 +682,17 @@ impl CrosswordReplica { } else { // on follower replica, finishing the logging of an // AcceptData entry leads to sending back an Accept reply - assert!(inst.replica_bk.is_some()); - let source = 
inst.replica_bk.as_ref().unwrap().source; - self.transport_hub.send_msg( - PeerMsg::AcceptReply { - slot, - ballot: inst.bal, - }, - source, - )?; - pf_trace!(self.id; "sent AcceptReply -> {} for slot {} bal {}", - source, slot, inst.bal); + if let Some(ReplicaBookkeeping { source }) = inst.replica_bk { + self.transport_hub.send_msg( + PeerMsg::AcceptReply { + slot, + ballot: inst.bal, + }, + source, + )?; + pf_trace!(self.id; "sent AcceptReply -> {} for slot {} bal {}", + source, slot, inst.bal); + } } Ok(()) @@ -660,14 +703,16 @@ impl CrosswordReplica { &mut self, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "finished CommitSlot logging for slot {} bal {}", - slot, self.insts[slot].bal); - assert!(self.insts[slot].status >= Status::Committed); + slot, self.insts[slot - self.start_slot].bal); // update index of the first non-committed instance if slot == self.commit_bar { - while self.commit_bar < self.insts.len() { - let inst = &mut self.insts[self.commit_bar]; + while self.commit_bar < self.start_slot + self.insts.len() { + let inst = &mut self.insts[self.commit_bar - self.start_slot]; if inst.status < Status::Committed { break; } @@ -716,10 +761,20 @@ impl CrosswordReplica { log_result: LogResult, ) -> Result<(), SummersetError> { let (slot, entry_type) = Self::split_log_action_id(action_id); - assert!(slot < self.insts.len()); + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } + assert!(slot < self.start_slot + self.insts.len()); if let LogResult::Append { now_size } = log_result { assert!(now_size >= self.log_offset); + // update first log_offset of slot + let inst = &mut self.insts[slot - self.start_slot]; + if inst.log_offset == 0 || inst.log_offset > self.log_offset { + inst.log_offset = self.log_offset; + } + assert!(inst.log_offset <= self.log_offset); + // then update self.log_offset self.log_offset = now_size; } else { return logged_err!(self.id; "unexpected log result type: {:?}", log_result); @@ -742,16 +797,19 @@ impl CrosswordReplica { slot: usize, ballot: Ballot, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received Prepare <- {} for slot {} bal {}", peer, slot, ballot); // if ballot is not smaller than what I have seen: if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; assert!(inst.bal <= ballot); inst.bal = ballot; @@ -784,17 +842,23 @@ impl CrosswordReplica { ballot: Ballot, voted: Option<(Ballot, RSCodeword)>, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received PrepareReply <- {} for slot {} bal {} shards {:?}", peer, slot, ballot, voted.as_ref().map(|(_, cw)| cw.avail_shards_map())); // if ballot is what I'm currently waiting on for Prepare replies: if ballot == self.bal_prep_sent { - assert!(slot < self.insts.len()); - let inst = &mut self.insts[slot]; + assert!(slot < self.start_slot + self.insts.len()); + let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if (inst.status != Status::Preparing) || (ballot < inst.bal) { + if 
!self.is_leader + || (inst.status != Status::Preparing) + || (ballot < inst.bal) + { return Ok(()); } assert_eq!(inst.bal, ballot); @@ -913,16 +977,19 @@ impl CrosswordReplica { ballot: Ballot, reqs_cw: RSCodeword, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received Accept <- {} for slot {} bal {} shards {:?}", peer, slot, ballot, reqs_cw.avail_shards_map()); // if ballot is not smaller than what I have made promises for: if ballot >= self.bal_max_seen { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; assert!(inst.bal <= ballot); inst.bal = ballot; @@ -960,16 +1027,22 @@ impl CrosswordReplica { slot: usize, ballot: Ballot, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received AcceptReply <- {} for slot {} bal {}", peer, slot, ballot); // if ballot is what I'm currently waiting on for Accept replies: if ballot == self.bal_prepared { - assert!(slot < self.insts.len()); - let inst = &mut self.insts[slot]; + assert!(slot < self.start_slot + self.insts.len()); + let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if (inst.status != Status::Accepting) || (ballot < inst.bal) { + if !self.is_leader + || (inst.status != Status::Accepting) + || (ballot < inst.bal) + { return Ok(()); } assert_eq!(inst.bal, ballot); @@ -1034,13 +1107,16 @@ impl CrosswordReplica { peer: ReplicaId, slot: usize, ) -> Result<(), SummersetError> { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } pf_trace!(self.id; "received Commit <- {} for slot {}", peer, slot); // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications if inst.status != Status::Accepting { @@ -1066,19 +1142,22 @@ impl CrosswordReplica { Ok(()) } - /// Handler of Recover message from leader. - fn handle_msg_recover( + /// Handler of Reconstruct message from leader. 
+ fn handle_msg_reconstruct( &mut self, peer: ReplicaId, slot: usize, ) -> Result<(), SummersetError> { - pf_trace!(self.id; "received Recover <- {} for slot {}", peer, slot); + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } + pf_trace!(self.id; "received Reconstruct <- {} for slot {}", peer, slot); // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications; also ignore if I have nothing to send back if inst.status < Status::Accepting || inst.reqs_cw.avail_shards() == 0 { @@ -1087,33 +1166,36 @@ impl CrosswordReplica { // send back my ballot for this slot and the available shards self.transport_hub.send_msg( - PeerMsg::RecoverReply { + PeerMsg::ReconstructReply { slot, ballot: inst.bal, reqs_cw: inst.reqs_cw.clone(), }, peer, )?; - pf_trace!(self.id; "sent RecoverReply message for slot {} bal {}", + pf_trace!(self.id; "sent ReconstructReply message for slot {} bal {}", slot, inst.bal); Ok(()) } - /// Handler of Recover reply from replica. - fn handle_msg_recover_reply( + /// Handler of Reconstruct reply from replica. + fn handle_msg_reconstruct_reply( &mut self, peer: ReplicaId, slot: usize, ballot: Ballot, reqs_cw: RSCodeword, ) -> Result<(), SummersetError> { - pf_trace!(self.id; "received RecoverReply <- {} for slot {} bal {} shards {:?}", + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } + pf_trace!(self.id; "received ReconstructReply <- {} for slot {} bal {} shards {:?}", peer, slot, ballot, reqs_cw.avail_shards_map()); - assert!(slot < self.insts.len()); - assert!(self.insts[slot].status >= Status::Committed); - let num_insts = self.insts.len(); - let inst = &mut self.insts[slot]; + assert!(slot < self.start_slot + self.insts.len()); + assert!(self.insts[slot - self.start_slot].status >= Status::Committed); + let num_insts = self.start_slot + self.insts.len(); + let inst = &mut self.insts[slot - self.start_slot]; // if reply not outdated and ballot is up-to-date if inst.status < Status::Executed && ballot >= inst.bal { @@ -1123,7 +1205,8 @@ impl CrosswordReplica { // if enough shards have been gathered, can push execution forward if slot == self.commit_bar { while self.commit_bar < num_insts { - let inst = &mut self.insts[self.commit_bar]; + let inst = + &mut self.insts[self.commit_bar - self.start_slot]; if inst.status < Status::Committed || inst.reqs_cw.avail_shards() < self.quorum_cnt { @@ -1190,12 +1273,14 @@ impl CrosswordReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), - PeerMsg::Recover { slot } => self.handle_msg_recover(peer, slot), - PeerMsg::RecoverReply { + PeerMsg::Reconstruct { slot } => { + self.handle_msg_reconstruct(peer, slot) + } + PeerMsg::ReconstructReply { slot, ballot, reqs_cw, - } => self.handle_msg_recover_reply(peer, slot, ballot, reqs_cw), + } => self.handle_msg_reconstruct_reply(peer, slot, ballot, reqs_cw), PeerMsg::Heartbeat { ballot } => self.heard_heartbeat(peer, ballot), } } @@ -1207,11 +1292,14 @@ impl CrosswordReplica { cmd_result: CommandResult, ) -> Result<(), SummersetError> { let (slot, cmd_idx) = Self::split_command_id(cmd_id); - assert!(slot < self.insts.len()); + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } + 
assert!(slot < self.start_slot + self.insts.len()); pf_trace!(self.id; "executed cmd in instance at slot {} idx {}", slot, cmd_idx); - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; let reqs = inst.reqs_cw.get_data()?; assert!(cmd_idx < reqs.len()); let (client, ref req) = reqs[cmd_idx]; @@ -1243,8 +1331,8 @@ impl CrosswordReplica { // update index of the first non-executed instance if slot == self.exec_bar { - while self.exec_bar < self.insts.len() { - let inst = &mut self.insts[self.exec_bar]; + while self.exec_bar < self.start_slot + self.insts.len() { + let inst = &mut self.insts[self.exec_bar - self.start_slot]; if inst.status < Status::Executed { break; } @@ -1276,11 +1364,21 @@ impl CrosswordReplica { self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen); self.bal_max_seen = self.bal_prep_sent; - for (slot, inst) in self.insts.iter_mut().enumerate() { + for (slot, inst) in self + .insts + .iter_mut() + .enumerate() + .map(|(s, i)| (self.start_slot + s, i)) + { // redo Prepare phase for all in-progress instances if inst.status < Status::Committed { inst.bal = self.bal_prep_sent; inst.status = Status::Preparing; + inst.leader_bk = Some(LeaderBookkeeping { + prepare_acks: Bitmap::new(self.population, false), + prepare_max_bal: 0, + accept_acks: HashMap::new(), + }); pf_debug!(self.id; "enter Prepare phase for slot {} bal {}", slot, inst.bal); @@ -1310,14 +1408,14 @@ impl CrosswordReplica { slot, inst.bal); } - // do recovery reads for all committed instances that do not + // do reconstruction reads for all committed instances that do not // hold enough available shards for reconstruction if inst.status == Status::Committed && inst.reqs_cw.avail_shards() < self.quorum_cnt { self.transport_hub - .bcast_msg(PeerMsg::Recover { slot }, None)?; - pf_trace!(self.id; "broadcast Recover messages for slot {} bal {} shards {:?}", + .bcast_msg(PeerMsg::Reconstruct { slot }, None)?; + pf_trace!(self.id; "broadcast Reconstruct messages for slot {} bal {} shards {:?}", slot, inst.bal, inst.reqs_cw.avail_shards_map()); } } @@ -1385,6 +1483,8 @@ impl CrosswordReplica { &mut self, durable: bool, ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got restart req"); + // send leave notification to peers and wait for their replies self.transport_hub.leave().await?; @@ -1444,6 +1544,19 @@ impl CrosswordReplica { Ok(()) } + /// Handler of TakeSnapshot control message. + async fn handle_ctrl_take_snapshot( + &mut self, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server told to take snapshot"); + self.take_new_snapshot().await?; + + self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { + new_start: self.start_slot, + })?; + Ok(()) + } + /// Synthesized handler of manager control messages. If ok, returns /// `Some(true)` if decides to terminate and reboot, `Some(false)` if /// decides to shutdown completely, and `None` if not terminating. 
@@ -1468,6 +1581,11 @@ impl CrosswordReplica { Ok(None) } + CtrlMsg::TakeSnapshot => { + self.handle_ctrl_take_snapshot().await?; + Ok(None) + } + _ => Ok(None), // ignore all other types } } @@ -1480,11 +1598,11 @@ impl CrosswordReplica { match entry { LogEntry::PrepareBal { slot, ballot } => { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } // update instance state - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = ballot; inst.status = Status::Preparing; // update bal_prep_sent and bal_max_seen, reset bal_prepared @@ -1503,15 +1621,20 @@ impl CrosswordReplica { reqs_cw, } => { // locate instance in memory, filling in null instances if needed - while self.insts.len() <= slot { + while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); } // update instance state - let inst = &mut self.insts[slot]; + let inst = &mut self.insts[slot - self.start_slot]; inst.bal = ballot; inst.status = Status::Accepting; inst.reqs_cw = reqs_cw.clone(); inst.voted = (ballot, reqs_cw); + // it could be the case that the PrepareBal action for this + // ballot has been snapshotted + if self.bal_prep_sent < ballot { + self.bal_prep_sent = ballot; + } // update bal_prepared and bal_max_seen if self.bal_prepared < ballot { self.bal_prepared = ballot; @@ -1523,14 +1646,15 @@ impl CrosswordReplica { } LogEntry::CommitSlot { slot } => { - assert!(slot < self.insts.len()); + assert!(slot < self.start_slot + self.insts.len()); // update instance state - self.insts[slot].status = Status::Committed; + self.insts[slot - self.start_slot].status = Status::Committed; // submit commands in contiguously committed instance to the // state machine if slot == self.commit_bar { - while self.commit_bar < self.insts.len() { - let inst = &mut self.insts[self.commit_bar]; + while self.commit_bar < self.start_slot + self.insts.len() { + let inst = + &mut self.insts[self.commit_bar - self.start_slot]; if inst.status < Status::Committed { break; } @@ -1614,6 +1738,221 @@ impl CrosswordReplica { logged_err!(self.id; "unexpected log result type or failed truncate") } } + + /// Dump a new key-value pair to snapshot file. + async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { + // collect all key-value pairs put up to exec_bar + let mut pairs = HashMap::new(); + for slot in self.start_slot..self.exec_bar { + let inst = &mut self.insts[slot - self.start_slot]; + assert!(inst.reqs_cw.avail_data_shards() >= self.quorum_cnt); + for (_, req) in inst.reqs_cw.get_data()?.clone() { + if let ApiRequest::Req { + cmd: Command::Put { key, value }, + .. + } = req + { + pairs.insert(key, value); + } + } + } + + // write the collection to snapshot file + self.snapshot_hub.submit_action( + 0, // using 0 as dummy log action ID + LogAction::Append { + entry: SnapEntry::KVPairSet { pairs }, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + if let LogResult::Append { now_size } = log_result { + self.snap_offset = now_size; + Ok(()) + } else { + logged_err!( + self.id; + "unexpected log result type" + ) + } + } + + /// Discard everything older than start_slot in durable WAL log. 
+ async fn snapshot_discard_log(&mut self) -> Result<(), SummersetError> { + let cut_offset = if !self.insts.is_empty() { + self.insts[0].log_offset + } else { + self.log_offset + }; + + // discard the log before cut_offset + if cut_offset > 0 { + self.storage_hub + .submit_action(0, LogAction::Discard { offset: cut_offset })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id != 0 { + // normal log action previously in queue; process it + self.handle_log_result(action_id, log_result)?; + } else { + if let LogResult::Discard { + offset_ok: true, + now_size, + } = log_result + { + assert_eq!(self.log_offset - cut_offset, now_size); + self.log_offset = now_size; + } else { + return logged_err!( + self.id; + "unexpected log result type or failed discard" + ); + } + break; + } + } + } + + // update inst.log_offset for all remaining in-mem instances + for inst in &mut self.insts { + if inst.log_offset > 0 { + assert!(inst.log_offset >= cut_offset); + inst.log_offset -= cut_offset; + } + } + + Ok(()) + } + + /// Take a snapshot up to current exec_idx, then discard the in-mem log up + /// to that index as well as outdate entries in the durable WAL log file. + /// + /// NOTE: the current implementation does not guard against crashes in the + /// middle of taking a snapshot. + async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { + pf_debug!(self.id; "taking new snapshot: start {} exec {}", + self.start_slot, self.exec_bar); + assert!(self.exec_bar >= self.start_slot); + if self.exec_bar == self.start_slot { + return Ok(()); + } + + // collect and dump all Puts in executed instances + if self.is_leader { + // NOTE: broadcast heartbeats here to appease followers + self.bcast_heartbeats()?; + } + self.snapshot_dump_kv_pairs().await?; + + // update start_slot and discard all in-memory log instances up to exec_bar + self.insts.drain(0..(self.exec_bar - self.start_slot)); + self.start_slot = self.exec_bar; + + // discarding everything older than start_slot in WAL log + if self.is_leader { + // NOTE: broadcast heartbeats here to appease followers + self.bcast_heartbeats()?; + } + self.snapshot_discard_log().await?; + + // reset the leader heartbeat hear timer + self.kickoff_hb_hear_timer()?; + + pf_info!(self.id; "took snapshot up to: start {}", self.start_slot); + Ok(()) + } + + /// Recover initial state from durable storage snapshot file. 
+ async fn recover_from_snapshot(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.snap_offset, 0); + + // first, try to read the first several bytes, which should record the + // start_slot index + self.snapshot_hub + .submit_action(0, LogAction::Read { offset: 0 })?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(SnapEntry::StartSlot { slot }), + end_offset, + } => { + self.snap_offset = end_offset; + self.start_slot = slot; // get start slot index of in-mem log + + // repeatedly apply key-value pairs + loop { + self.snapshot_hub.submit_action( + 0, + LogAction::Read { + offset: self.snap_offset, + }, + )?; + let (_, log_result) = + self.snapshot_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(SnapEntry::KVPairSet { pairs }), + end_offset, + } => { + // execute Put commands on state machine + for (key, value) in pairs { + self.state_machine.submit_cmd( + 0, + Command::Put { key, value }, + )?; + let _ = self.state_machine.get_result().await?; + } + // update snapshot file offset + self.snap_offset = end_offset; + } + LogResult::Read { entry: None, .. } => { + // end of log reached + break; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + } + + // tell manager about my start_slot index + self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { + new_start: self.start_slot, + })?; + Ok(()) + } + + LogResult::Read { entry: None, .. } => { + // snapshot file is empty. Write a 0 as start_slot and return + self.snapshot_hub.submit_action( + 0, + LogAction::Write { + entry: SnapEntry::StartSlot { slot: 0 }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + if let LogResult::Write { + offset_ok: true, + now_size, + } = log_result + { + self.snap_offset = now_size; + Ok(()) + } else { + logged_err!(self.id; "unexpected log result type or failed truncate") + } + } + + _ => { + logged_err!(self.id; "unexpected log result type") + } + } + } } #[async_trait] @@ -1635,6 +1974,7 @@ impl GenericReplica for CrosswordReplica { backer_path, logger_sync, hb_hear_timeout_min, hb_hear_timeout_max, hb_send_interval_ms, + snapshot_path, snapshot_interval_s, fault_tolerance, shards_per_replica, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; @@ -1737,6 +2077,14 @@ impl GenericReplica for CrosswordReplica { } transport_hub.wait_for_group(population).await?; + // setup snapshot hub module + let snapshot_hub = StorageHub::new_and_setup( + id, + Path::new(&config.snapshot_path), + None, + ) + .await?; + // setup external API module, ready to take in client requests let external_api = ExternalApi::new_and_setup( id, @@ -1750,6 +2098,15 @@ impl GenericReplica for CrosswordReplica { time::interval(Duration::from_millis(config.hb_send_interval_ms)); hb_send_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + let mut snapshot_interval = time::interval(Duration::from_secs( + if config.snapshot_interval_s > 0 { + config.snapshot_interval_s + } else { + 60 // dummy non-zero value to make `time::interval` happy + }, + )); + snapshot_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + Ok(CrosswordReplica { id, population, @@ -1761,17 +2118,21 @@ impl GenericReplica for CrosswordReplica { external_api, state_machine, storage_hub, + snapshot_hub, transport_hub, hb_hear_timer: Timer::new(), hb_send_interval, is_leader: false, insts: vec![], + start_slot: 0, + 
snapshot_interval, bal_prep_sent: 0, bal_prepared: 0, bal_max_seen: 0, commit_bar: 0, exec_bar: 0, log_offset: 0, + snap_offset: 0, rs_coder, }) } @@ -1780,7 +2141,10 @@ impl GenericReplica for CrosswordReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { - // recover state from durable storage log + // recover state from durable snapshot file + self.recover_from_snapshot().await?; + + // recover the tail-piece memory log & state from durable storage log self.recover_from_log().await?; // kick off leader activity hearing timer @@ -1853,6 +2217,18 @@ impl GenericReplica for CrosswordReplica { } }, + // autonomous snapshot taking timeout + _ = self.snapshot_interval.tick(), if !paused + && self.config.snapshot_interval_s > 0 => { + if let Err(e) = self.take_new_snapshot().await { + pf_error!(self.id; "error taking a new snapshot: {}", e); + } else { + self.control_hub.send_ctrl( + CtrlMsg::SnapshotUpTo { new_start: self.start_slot } + )?; + } + }, + // manager control message ctrl_msg = self.control_hub.recv_ctrl() => { if let Err(e) = ctrl_msg { @@ -1863,10 +2239,6 @@ impl GenericReplica for CrosswordReplica { match self.handle_ctrl_msg(ctrl_msg, &mut paused).await { Ok(terminate) => { if let Some(restart) = terminate { - pf_warn!( - self.id; - "server got {} req", - if restart { "restart" } else { "shutdown" }); return Ok(restart); } }, From 70fb71f0853e2e0feacd9ce109ed945181534bdc Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 24 Sep 2023 23:16:17 -0500 Subject: [PATCH 59/89] finish implementation of snapshotting --- src/manager/clusman.rs | 2 +- src/protocols/crossword.rs | 14 +++++++++----- src/protocols/multipaxos.rs | 14 +++++++++----- src/protocols/rs_paxos.rs | 14 +++++++++----- src/server/transport.rs | 29 ++++++++++++++++------------- 5 files changed, 44 insertions(+), 29 deletions(-) diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index f1a496c9..a21ef9c7 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -360,7 +360,7 @@ impl ClusterManager { } // wait a while to ensure the server's transport hub is setup - time::sleep(Duration::from_millis(300)).await; + time::sleep(Duration::from_millis(500)).await; reset_done.insert(s); } diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index f8dda64d..3964c9bf 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -233,7 +233,7 @@ enum PeerMsg { }, /// Leader activity heartbeat. - Heartbeat { ballot: Ballot }, + Heartbeat { ballot: Ballot, exec_bar: usize }, } /// Crossword server replica module. 
@@ -1281,7 +1281,9 @@ impl CrosswordReplica { ballot, reqs_cw, } => self.handle_msg_reconstruct_reply(peer, slot, ballot, reqs_cw), - PeerMsg::Heartbeat { ballot } => self.heard_heartbeat(peer, ballot), + PeerMsg::Heartbeat { ballot, exec_bar } => { + self.heard_heartbeat(peer, ballot, exec_bar) + } } } @@ -1428,10 +1430,11 @@ impl CrosswordReplica { self.transport_hub.bcast_msg( PeerMsg::Heartbeat { ballot: self.bal_prep_sent, + exec_bar: self.exec_bar, }, None, )?; - self.heard_heartbeat(self.id, self.bal_prep_sent)?; + self.heard_heartbeat(self.id, self.bal_prep_sent, self.exec_bar)?; // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent); Ok(()) @@ -1457,9 +1460,10 @@ impl CrosswordReplica { &mut self, _peer: ReplicaId, ballot: Ballot, + exec_bar: usize, ) -> Result<(), SummersetError> { - // ignore outdated hearbeat - if ballot < self.bal_max_seen { + // ignore outdated heartbeats and those from peers with exec_bar < mine + if ballot < self.bal_max_seen || exec_bar < self.exec_bar { return Ok(()); } diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index d9d5cd76..00e5f964 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -214,7 +214,7 @@ enum PeerMsg { Commit { slot: usize }, /// Leader activity heartbeat. - Heartbeat { ballot: Ballot }, + Heartbeat { ballot: Ballot, exec_bar: usize }, } /// MultiPaxos server replica module. @@ -963,7 +963,9 @@ impl MultiPaxosReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), - PeerMsg::Heartbeat { ballot } => self.heard_heartbeat(peer, ballot), + PeerMsg::Heartbeat { ballot, exec_bar } => { + self.heard_heartbeat(peer, ballot, exec_bar) + } } } @@ -1098,10 +1100,11 @@ impl MultiPaxosReplica { self.transport_hub.bcast_msg( PeerMsg::Heartbeat { ballot: self.bal_prep_sent, + exec_bar: self.exec_bar, }, None, )?; - self.heard_heartbeat(self.id, self.bal_prep_sent)?; + self.heard_heartbeat(self.id, self.bal_prep_sent, self.exec_bar)?; // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent); Ok(()) @@ -1127,9 +1130,10 @@ impl MultiPaxosReplica { &mut self, _peer: ReplicaId, ballot: Ballot, + exec_bar: usize, ) -> Result<(), SummersetError> { - // ignore outdated hearbeat - if ballot < self.bal_max_seen { + // ignore outdated heartbeats and those from peers with exec_bar < mine + if ballot < self.bal_max_seen || exec_bar < self.exec_bar { return Ok(()); } diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 4aa5a81f..82990870 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -226,7 +226,7 @@ enum PeerMsg { }, /// Leader activity heartbeat. - Heartbeat { ballot: Ballot }, + Heartbeat { ballot: Ballot, exec_bar: usize }, } /// RSPaxos server replica module. 
@@ -1180,7 +1180,9 @@ impl RSPaxosReplica { ballot, reqs_cw, } => self.handle_msg_reconstruct_reply(peer, slot, ballot, reqs_cw), - PeerMsg::Heartbeat { ballot } => self.heard_heartbeat(peer, ballot), + PeerMsg::Heartbeat { ballot, exec_bar } => { + self.heard_heartbeat(peer, ballot, exec_bar) + } } } @@ -1327,10 +1329,11 @@ impl RSPaxosReplica { self.transport_hub.bcast_msg( PeerMsg::Heartbeat { ballot: self.bal_prep_sent, + exec_bar: self.exec_bar, }, None, )?; - self.heard_heartbeat(self.id, self.bal_prep_sent)?; + self.heard_heartbeat(self.id, self.bal_prep_sent, self.exec_bar)?; // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent); Ok(()) @@ -1356,9 +1359,10 @@ impl RSPaxosReplica { &mut self, _peer: ReplicaId, ballot: Ballot, + exec_bar: usize, ) -> Result<(), SummersetError> { - // ignore outdated hearbeat - if ballot < self.bal_max_seen { + // ignore outdated heartbeats and those from peers with exec_bar < mine + if ballot < self.bal_max_seen || exec_bar < self.exec_bar { return Ok(()); } diff --git a/src/server/transport.rs b/src/server/transport.rs index 699697d8..ba0e1e8b 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -295,8 +295,9 @@ where /// Broadcasts leave notifications to all peers and waits for replies. pub async fn leave(&mut self) -> Result<(), SummersetError> { - let tx_sends_guard = self.tx_sends.guard(); + #[allow(unused_variables)] let mut num_peers = 0; + let tx_sends_guard = self.tx_sends.guard(); for &peer in tx_sends_guard.keys() { if peer == self.me { continue; @@ -311,18 +312,20 @@ where num_peers += 1; } - let mut replies = Bitmap::new(self.population, false); - while replies.count() < num_peers { - match self.rx_recv.recv().await { - Some((id, peer_msg)) => match peer_msg { - PeerMessage::LeaveReply => replies.set(id, true)?, - _ => continue, // ignore all other types of messages - }, - None => { - return logged_err!(self.me; "recv channel has been closed"); - } - } - } + // NOTE: commenting out the following to avoid rare blocking during + // tester resets + // let mut replies = Bitmap::new(self.population, false); + // while replies.count() < num_peers { + // match self.rx_recv.recv().await { + // Some((id, peer_msg)) => match peer_msg { + // PeerMessage::LeaveReply => replies.set(id, true)?, + // _ => continue, // ignore all other types of messages + // }, + // None => { + // return logged_err!(self.me; "recv channel has been closed"); + // } + // } + // } Ok(()) } From dee1ef06e1ff872eb67391dac9b9d3c3ca0faa37 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Mon, 25 Sep 2023 12:49:01 -0500 Subject: [PATCH 60/89] exclude unwanted shards in Reconstruct --- src/protocols/crossword.rs | 39 +++++++++++++++++++++++++++++++------- src/utils/bitmap.rs | 14 ++++++++++++++ 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 3964c9bf..0cd89f2e 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -223,7 +223,7 @@ enum PeerMsg { Commit { slot: usize }, /// Reconstruction read from new leader to replicas. - Reconstruct { slot: usize }, + Reconstruct { slot: usize, exclude: Vec }, /// Reconstruction read reply from replica to leader. 
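    /// Carries only shards the requester is still missing: the requester lists the
    /// shards it already holds in `exclude`, and the responder replies with whatever
    /// it holds outside that set (see handle_msg_reconstruct's subset_copy).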
ReconstructReply { @@ -1147,6 +1147,7 @@ impl CrosswordReplica { &mut self, peer: ReplicaId, slot: usize, + exclude: Vec, ) -> Result<(), SummersetError> { if slot < self.start_slot { return Ok(()); // ignore if slot index outdated @@ -1160,7 +1161,13 @@ impl CrosswordReplica { let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications; also ignore if I have nothing to send back - if inst.status < Status::Accepting || inst.reqs_cw.avail_shards() == 0 { + if inst.status < Status::Accepting { + return Ok(()); + } + let mut subset = Bitmap::from(inst.reqs_cw.num_shards(), exclude); + subset.flip(); // exclude unwanted shards the sender already has + let reply_cw = inst.reqs_cw.subset_copy(subset, false)?; + if reply_cw.avail_shards() == 0 { return Ok(()); } @@ -1169,7 +1176,7 @@ impl CrosswordReplica { PeerMsg::ReconstructReply { slot, ballot: inst.bal, - reqs_cw: inst.reqs_cw.clone(), + reqs_cw: reply_cw.clone(), }, peer, )?; @@ -1273,8 +1280,8 @@ impl CrosswordReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), - PeerMsg::Reconstruct { slot } => { - self.handle_msg_reconstruct(peer, slot) + PeerMsg::Reconstruct { slot, exclude } => { + self.handle_msg_reconstruct(peer, slot, exclude) } PeerMsg::ReconstructReply { slot, @@ -1415,8 +1422,26 @@ impl CrosswordReplica { if inst.status == Status::Committed && inst.reqs_cw.avail_shards() < self.quorum_cnt { - self.transport_hub - .bcast_msg(PeerMsg::Reconstruct { slot }, None)?; + self.transport_hub.bcast_msg( + PeerMsg::Reconstruct { + slot, + exclude: inst + .reqs_cw + .avail_shards_map() + .iter() + .filter_map( + |(idx, flag)| { + if flag { + Some(idx) + } else { + None + } + }, + ) + .collect(), + }, + None, + )?; pf_trace!(self.id; "broadcast Reconstruct messages for slot {} bal {} shards {:?}", slot, inst.bal, inst.reqs_cw.avail_shards_map()); } diff --git a/src/utils/bitmap.rs b/src/utils/bitmap.rs index d5fe9c8e..592ddb0c 100644 --- a/src/utils/bitmap.rs +++ b/src/utils/bitmap.rs @@ -71,6 +71,12 @@ impl Bitmap { self.0.count_ones(..) as u8 } + /// Flips all flags in the bitmap. + #[inline] + pub fn flip(&mut self) { + self.0.toggle_range(..) + } + /// Allows `for (id, bit) in map.iter()`. #[inline] pub fn iter(&self) -> BitmapIter { @@ -143,6 +149,14 @@ mod bitmap_tests { assert!(map.get(7).is_err()); } + #[test] + fn bitmap_flip() { + let mut map = Bitmap::new(5, false); + assert!(map.set(1, true).is_ok()); + map.flip(); + assert_eq!(map, Bitmap::from(5, vec![0, 2, 3, 4])); + } + #[test] fn bitmap_count() { let mut map = Bitmap::new(7, false); From 9fce9cb6daa7876e0308a32a5f66bb3fb3850233 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Mon, 25 Sep 2023 15:01:43 -0500 Subject: [PATCH 61/89] add primitive follower gossiping --- src/protocols/crossword.rs | 122 ++++++++++++++++++++++++++++++++++- src/protocols/multipaxos.rs | 26 +++++++- src/protocols/rep_nothing.rs | 16 +++++ src/protocols/rs_paxos.rs | 26 +++++++- src/protocols/simple_push.rs | 19 ++++++ 5 files changed, 204 insertions(+), 5 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 0cd89f2e..95f044c1 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -48,7 +48,6 @@ pub struct ReplicaConfigCrossword { /// Min timeout of not hearing any heartbeat from leader in millisecs. pub hb_hear_timeout_min: u64, - /// Max timeout of not hearing any heartbeat from leader in millisecs. 
pub hb_hear_timeout_max: u64, @@ -62,6 +61,11 @@ pub struct ReplicaConfigCrossword { /// snapshotting autonomously. pub snapshot_interval_s: u64, + /// Min timeout of follower gossiping trigger in millisecs. + pub gossip_timeout_min: u64, + /// Max timeout of follower gossiping trigger in millisecs. + pub gossip_timeout_max: u64, + /// Fault-tolerance level. pub fault_tolerance: u8, @@ -89,6 +93,8 @@ impl Default for ReplicaConfigCrossword { hb_send_interval_ms: 50, snapshot_path: "/tmp/summerset.rs_paxos.snap".into(), snapshot_interval_s: 0, + gossip_timeout_min: 100, + gossip_timeout_max: 300, fault_tolerance: 0, shards_per_replica: 1, perf_storage_a: 0, @@ -292,6 +298,9 @@ pub struct CrosswordReplica { /// Timer for taking a new autonomous snapshot. snapshot_interval: Interval, + /// Titer for trigger follower gossiping. + gossip_timer: Timer, + /// Largest ballot number that a leader has sent Prepare messages in. bal_prep_sent: Ballot, @@ -318,6 +327,7 @@ pub struct CrosswordReplica { rs_coder: ReedSolomon, } +// CrosswordReplica common helpers impl CrosswordReplica { /// Create an empty null instance. fn null_instance(&self) -> Result { @@ -442,7 +452,10 @@ impl CrosswordReplica { min_coverage } +} +// CrosswordReplica client requests entrance +impl CrosswordReplica { /// Handler of client request batch chan recv. fn handle_req_batch( &mut self, @@ -619,7 +632,10 @@ impl CrosswordReplica { Ok(()) } +} +// CrosswordReplica durable WAL logging +impl CrosswordReplica { /// Handler of PrepareBal logging result chan recv. fn handle_logged_prepare_bal( &mut self, @@ -789,7 +805,10 @@ impl CrosswordReplica { } } } +} +// CrosswordReplica peer-peer messages handling +impl CrosswordReplica { /// Handler of Prepare message from leader. fn handle_msg_prepare( &mut self, @@ -1293,7 +1312,10 @@ impl CrosswordReplica { } } } +} +// CrosswordReplica state machine execution +impl CrosswordReplica { /// Handler of state machine exec result chan recv. fn handle_cmd_result( &mut self, @@ -1352,7 +1374,10 @@ impl CrosswordReplica { Ok(()) } +} +// CrosswordReplica leadership related logic +impl CrosswordReplica { /// Becomes a leader, sends self-initiated Prepare messages to followers /// for all in-progress instances, and starts broadcasting heartbeats. fn become_a_leader(&mut self) -> Result<(), SummersetError> { @@ -1506,7 +1531,80 @@ impl CrosswordReplica { // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); Ok(()) } +} + +// CrosswordReplica follower gossiping +impl CrosswordReplica { + /// Chooses a random gossip_timeout from the min-max range and kicks off + /// the gossip_timer. + fn kickoff_gossip_timer(&mut self) -> Result<(), SummersetError> { + let timeout_ms = thread_rng().gen_range( + self.config.gossip_timeout_min..=self.config.gossip_timeout_max, + ); + + // pf_trace!(self.id; "kickoff gossip_timer @ {} ms", timeout_ms); + self.gossip_timer + .kickoff(Duration::from_millis(timeout_ms))?; + Ok(()) + } + /// Triggers gossiping for my missing shards in committed but not-yet- + /// executed instances: fetch missing shards from peers, preferring + /// follower peers that hold data shards. + fn trigger_gossiping(&mut self) -> Result<(), SummersetError> { + // TODO: want cleverer design than this! 
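+        // Primitive strategy: walk slots from exec_bar up to the log tail; for each
+        // instance that is committed but not yet executed and still lacks enough
+        // shards (avail_shards < quorum_cnt), broadcast Reconstruct to every peer
+        // except the leader that originally replicated the instance to me, listing
+        // my already-held shards in `exclude` so peers only send back what is missing.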
+ let mut slot_up_to = self.exec_bar; + for slot in self.exec_bar..(self.start_slot + self.insts.len()) { + slot_up_to = slot; + let inst = &self.insts[slot - self.start_slot]; + if inst.status >= Status::Executed { + continue; + } else if inst.status < Status::Committed { + break; + } + + if inst.reqs_cw.avail_shards() < self.quorum_cnt { + let mut target = Bitmap::new(self.population, true); + if let Some(ReplicaBookkeeping { source }) = inst.replica_bk { + // skip leader who initially replicated this instance to me + target.set(source, false)?; + } + self.transport_hub.bcast_msg( + PeerMsg::Reconstruct { + slot, + exclude: inst + .reqs_cw + .avail_shards_map() + .iter() + .filter_map( + |(idx, flag)| { + if flag { + Some(idx) + } else { + None + } + }, + ) + .collect(), + }, + Some(target), + )?; + } + } + + // reset gossip trigger timer + self.kickoff_gossip_timer()?; + + if slot_up_to > self.exec_bar { + pf_debug!(self.id; "triggered gossiping: slots {} - {}", + self.exec_bar, slot_up_to); + } + Ok(()) + } +} + +// CrosswordReplica control messages handling +impl CrosswordReplica { /// Handler of ResetState control message. async fn handle_ctrl_reset_state( &mut self, @@ -1618,7 +1716,10 @@ impl CrosswordReplica { _ => Ok(None), // ignore all other types } } +} +// CrosswordReplica recovery from WAL log +impl CrosswordReplica { /// Apply a durable storage log entry for recovery. async fn recover_apply_entry( &mut self, @@ -1767,7 +1868,10 @@ impl CrosswordReplica { logged_err!(self.id; "unexpected log result type or failed truncate") } } +} +// CrosswordReplica snapshotting & GC logic +impl CrosswordReplica { /// Dump a new key-value pair to snapshot file. async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { // collect all key-value pairs put up to exec_bar @@ -1860,12 +1964,12 @@ impl CrosswordReplica { /// NOTE: the current implementation does not guard against crashes in the /// middle of taking a snapshot. 
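    /// Taking a snapshot dumps the key-value pairs applied by executed instances
    /// (everything up to exec_bar) into the snapshot file; the main loop then
    /// reports the updated start_slot to the manager via CtrlMsg::SnapshotUpTo.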
async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { - pf_debug!(self.id; "taking new snapshot: start {} exec {}", - self.start_slot, self.exec_bar); assert!(self.exec_bar >= self.start_slot); if self.exec_bar == self.start_slot { return Ok(()); } + pf_debug!(self.id; "taking new snapshot: start {} exec {}", + self.start_slot, self.exec_bar); // collect and dump all Puts in executed instances if self.is_leader { @@ -2004,6 +2108,7 @@ impl GenericReplica for CrosswordReplica { hb_hear_timeout_min, hb_hear_timeout_max, hb_send_interval_ms, snapshot_path, snapshot_interval_s, + gossip_timeout_min, gossip_timeout_max, fault_tolerance, shards_per_replica, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; @@ -2155,6 +2260,7 @@ impl GenericReplica for CrosswordReplica { insts: vec![], start_slot: 0, snapshot_interval, + gossip_timer: Timer::new(), bal_prep_sent: 0, bal_prepared: 0, bal_max_seen: 0, @@ -2179,6 +2285,9 @@ impl GenericReplica for CrosswordReplica { // kick off leader activity hearing timer self.kickoff_hb_hear_timer()?; + // kick off follower gossiping trigger timer + self.kickoff_gossip_timer()?; + // main event loop let mut paused = false; loop { @@ -2258,6 +2367,13 @@ impl GenericReplica for CrosswordReplica { } }, + // follower gossiping trigger + _ = self.gossip_timer.timeout(), if !paused && !self.is_leader => { + if let Err(e) = self.trigger_gossiping() { + pf_error!(self.id; "error triggering gossiping: {}", e); + } + }, + // manager control message ctrl_msg = self.control_hub.recv_ctrl() => { if let Err(e) = ctrl_msg { diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 00e5f964..4783f4f7 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -49,7 +49,6 @@ pub struct ReplicaConfigMultiPaxos { /// Min timeout of not hearing any heartbeat from leader in millisecs. pub hb_hear_timeout_min: u64, - /// Max timeout of not hearing any heartbeat from leader in millisecs. pub hb_hear_timeout_max: u64, @@ -296,6 +295,7 @@ pub struct MultiPaxosReplica { snap_offset: usize, } +// MultiPaxosReplica common helpers impl MultiPaxosReplica { /// Create an empty null instance. fn null_instance(&self) -> Instance { @@ -359,7 +359,10 @@ impl MultiPaxosReplica { let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; (slot, cmd_idx) } +} +// MultiPaxosReplica client requests entrance +impl MultiPaxosReplica { /// Handler of client request batch chan recv. fn handle_req_batch( &mut self, @@ -498,7 +501,10 @@ impl MultiPaxosReplica { Ok(()) } +} +// MultiPaxosReplica durable WAL logging +impl MultiPaxosReplica { /// Handler of PrepareBal logging result chan recv. fn handle_logged_prepare_bal( &mut self, @@ -657,7 +663,10 @@ impl MultiPaxosReplica { } } } +} +// MultiPaxosReplica peer-peer messages handling +impl MultiPaxosReplica { /// Handler of Prepare message from leader. fn handle_msg_prepare( &mut self, @@ -968,7 +977,10 @@ impl MultiPaxosReplica { } } } +} +// MultiPaxosReplica state machine execution +impl MultiPaxosReplica { /// Handler of state machine exec result chan recv. fn handle_cmd_result( &mut self, @@ -1026,7 +1038,10 @@ impl MultiPaxosReplica { Ok(()) } +} +// MultiPaxosReplica leadership related logic +impl MultiPaxosReplica { /// Becomes a leader, sends self-initiated Prepare messages to followers /// for all in-progress instances, and starts broadcasting heartbeats. 
fn become_a_leader(&mut self) -> Result<(), SummersetError> { @@ -1151,7 +1166,10 @@ impl MultiPaxosReplica { // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); Ok(()) } +} +// MultiPaxosReplica control messages handling +impl MultiPaxosReplica { /// Handler of ResetState control message. async fn handle_ctrl_reset_state( &mut self, @@ -1263,7 +1281,10 @@ impl MultiPaxosReplica { _ => Ok(None), // ignore all other types } } +} +// MultiPaxosReplica recovery from WAL log +impl MultiPaxosReplica { /// Apply a durable storage log entry for recovery. async fn recover_apply_entry( &mut self, @@ -1397,7 +1418,10 @@ impl MultiPaxosReplica { logged_err!(self.id; "unexpected log result type or failed truncate") } } +} +// MultiPaxosReplica snapshotting & GC logic +impl MultiPaxosReplica { /// Dump a new key-value pair to snapshot file. async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { // collect all key-value pairs put up to exec_bar diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index e5c6b0dd..74ea31a7 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -106,6 +106,7 @@ pub struct RepNothingReplica { log_offset: usize, } +// RepNothingReplica common helpers impl RepNothingReplica { /// Compose CommandId from instance index & command index within. fn make_command_id(inst_idx: usize, cmd_idx: usize) -> CommandId { @@ -120,7 +121,10 @@ impl RepNothingReplica { let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; (inst_idx, cmd_idx) } +} +// RepNothingReplica client requests entrance +impl RepNothingReplica { /// Handler of client request batch chan recv. fn handle_req_batch( &mut self, @@ -149,7 +153,10 @@ impl RepNothingReplica { Ok(()) } +} +// RepNothingReplica durable WAL logging +impl RepNothingReplica { /// Handler of durable logging result chan recv. fn handle_log_result( &mut self, @@ -190,7 +197,10 @@ impl RepNothingReplica { Ok(()) } +} +// RepNothingReplica state machine execution +impl RepNothingReplica { /// Handler of state machine exec result chan recv. fn handle_cmd_result( &mut self, @@ -236,7 +246,10 @@ impl RepNothingReplica { Ok(()) } +} +// RepNothingReplica control messages handling +impl RepNothingReplica { /// Handler of ResetState control message. async fn handle_ctrl_reset_state( &mut self, @@ -321,7 +334,10 @@ impl RepNothingReplica { _ => Ok(None), // ignore all other types } } +} +// RepNothingReplica recovery from WAL log +impl RepNothingReplica { /// Recover state from durable storage log. async fn recover_from_log(&mut self) -> Result<(), SummersetError> { assert_eq!(self.log_offset, 0); diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 82990870..5d2fb450 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -47,7 +47,6 @@ pub struct ReplicaConfigRSPaxos { /// Min timeout of not hearing any heartbeat from leader in millisecs. pub hb_hear_timeout_min: u64, - /// Max timeout of not hearing any heartbeat from leader in millisecs. pub hb_hear_timeout_max: u64, @@ -311,6 +310,7 @@ pub struct RSPaxosReplica { rs_coder: ReedSolomon, } +// RSPaxosReplica common helpers impl RSPaxosReplica { /// Create an empty null instance. fn null_instance(&self) -> Result { @@ -383,7 +383,10 @@ impl RSPaxosReplica { let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; (slot, cmd_idx) } +} +// RSPaxosReplica client requests entrance +impl RSPaxosReplica { /// Handler of client request batch chan recv. 
fn handle_req_batch( &mut self, @@ -545,7 +548,10 @@ impl RSPaxosReplica { Ok(()) } +} +// RSPaxosReplica durable WAL logging +impl RSPaxosReplica { /// Handler of PrepareBal logging result chan recv. fn handle_logged_prepare_bal( &mut self, @@ -715,7 +721,10 @@ impl RSPaxosReplica { } } } +} +// RSPaxosReplica peer-peer messages handling +impl RSPaxosReplica { /// Handler of Prepare message from leader. fn handle_msg_prepare( &mut self, @@ -1185,7 +1194,10 @@ impl RSPaxosReplica { } } } +} +// RSPaxosReplica state machine execution +impl RSPaxosReplica { /// Handler of state machine exec result chan recv. fn handle_cmd_result( &mut self, @@ -1244,7 +1256,10 @@ impl RSPaxosReplica { Ok(()) } +} +// RSPaxosReplica leadership related logic +impl RSPaxosReplica { /// Becomes a leader, sends self-initiated Prepare messages to followers /// for all in-progress instances, and starts broadcasting heartbeats. fn become_a_leader(&mut self) -> Result<(), SummersetError> { @@ -1380,7 +1395,10 @@ impl RSPaxosReplica { // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); Ok(()) } +} +// RSPaxosReplica control messages handling +impl RSPaxosReplica { /// Handler of ResetState control message. async fn handle_ctrl_reset_state( &mut self, @@ -1492,7 +1510,10 @@ impl RSPaxosReplica { _ => Ok(None), // ignore all other types } } +} +// RSPaxosReplica recovery from WAL log +impl RSPaxosReplica { /// Apply a durable storage log entry for recovery. async fn recover_apply_entry( &mut self, @@ -1641,7 +1662,10 @@ impl RSPaxosReplica { logged_err!(self.id; "unexpected log result type or failed truncate") } } +} +// RSPaxosReplica snapshotting & GC logic +impl RSPaxosReplica { /// Dump a new key-value pair to snapshot file. async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { // collect all key-value pairs put up to exec_bar diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index ce89c7d1..c6a283c4 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -138,6 +138,7 @@ pub struct SimplePushReplica { log_offset: usize, } +// SimplePushReplica common helpers impl SimplePushReplica { /// Compose CommandId from instance index & command index within. fn make_command_id(inst_idx: usize, cmd_idx: usize) -> CommandId { @@ -152,7 +153,10 @@ impl SimplePushReplica { let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; (inst_idx, cmd_idx) } +} +// SimplePushReplica client requests entrance +impl SimplePushReplica { /// Handler of client request batch chan recv. fn handle_req_batch( &mut self, @@ -208,7 +212,10 @@ impl SimplePushReplica { Ok(()) } +} +// SimplePushReplica durable WAL logging +impl SimplePushReplica { /// Handler of durable logging result chan recv. fn handle_log_result( &mut self, @@ -265,7 +272,10 @@ impl SimplePushReplica { Ok(()) } +} +// SimplePushReplica peer-peer messages handling +impl SimplePushReplica { /// Handler of push message from peer. fn handle_push_msg( &mut self, @@ -346,7 +356,10 @@ impl SimplePushReplica { Ok(()) } +} +// SimplePushReplica state machine execution +impl SimplePushReplica { /// Handler of state machine exec result chan recv. fn handle_cmd_result( &mut self, @@ -398,7 +411,10 @@ impl SimplePushReplica { Ok(()) } +} +// SimplePushReplica control messages handling +impl SimplePushReplica { /// Handler of ResetState control message. 
async fn handle_ctrl_reset_state( &mut self, @@ -486,7 +502,10 @@ impl SimplePushReplica { _ => Ok(None), // ignore all other types } } +} +// SimplePushReplica recovery from WAL log +impl SimplePushReplica { /// Recover state from durable storage log. async fn recover_from_log(&mut self) -> Result<(), SummersetError> { assert_eq!(self.log_offset, 0); From 59666514e62bf07f7a3d2d3f6abc0eee98064187 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Mon, 25 Sep 2023 19:07:51 -0500 Subject: [PATCH 62/89] add perf sim skeleton --- perf_sim/perf_sim.py | 70 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 perf_sim/perf_sim.py diff --git a/perf_sim/perf_sim.py b/perf_sim/perf_sim.py new file mode 100644 index 00000000..c2cbc40f --- /dev/null +++ b/perf_sim/perf_sim.py @@ -0,0 +1,70 @@ +import simpy + + +class Data: + def __init__(self, mark, size): + self.mark = mark + self.size = size + + +class NetLink: + def __init__(self, env, a, b): + self.env = env + self.a = a + self.b = b + self.store = simpy.Store(env) + + def delay(self, data): + delay = self.a + self.b * data.size + yield self.env.timeout(delay) + self.store.put(data) + + def send(self, data): + self.env.process(self.delay(data)) + + def recv(self): + return self.store.get() + + +class DiskDev: + def __init__(self, env, a, b): + self.env = env + self.a = a + self.b = b + + def delay(self, data): + delay = self.a + self.b * data.size + yield self.env.timeout(delay) + + def save(self, data): + self.env.process(self.delay(data)) + + +class Replica: + def __init__(self, env, is_leader, disk_a, disk_b): + self.env = env + self.is_leader = is_leader + self.disk_dev = DiskDev(env, disk_a, disk_b) + self.peers = dict() + self.net_links = dict() + + def add_peer(self, name, peer, net_a, net_b): + self.peers[name] = peer + self.net_links[name] = NetLink(self.env, net_a, net_b) + + def run(self): + while True: + yield self.env.timeout(3) + print(f"req") + yield self.disk_dev.save(Data("d", 2)) + print(f"saved") + + +class Cluster: + def __init__(self, num_replicas): + pass + + +if __name__ == "__main__": + env = simpy.Environment() + env.run(until=15) From 6e341671003338ce58cb8b34639fe406ff20e594 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 28 Sep 2023 18:37:36 -0500 Subject: [PATCH 63/89] add constraint boundary figure script --- perf_sim/perf_sim.py | 2 +- perf_sim/plot_cstr_bounds.py | 189 +++++++++++++++++++++++++++++++++++ 2 files changed, 190 insertions(+), 1 deletion(-) create mode 100644 perf_sim/plot_cstr_bounds.py diff --git a/perf_sim/perf_sim.py b/perf_sim/perf_sim.py index c2cbc40f..40d85d00 100644 --- a/perf_sim/perf_sim.py +++ b/perf_sim/perf_sim.py @@ -1,4 +1,4 @@ -import simpy +import simpy # type: ignore class Data: diff --git a/perf_sim/plot_cstr_bounds.py b/perf_sim/plot_cstr_bounds.py new file mode 100644 index 00000000..091f6f70 --- /dev/null +++ b/perf_sim/plot_cstr_bounds.py @@ -0,0 +1,189 @@ +import matplotlib # type: ignore + +matplotlib.use("Agg") + +import matplotlib.pyplot as plt # type: ignore +import matplotlib.patches as mpatches # type: ignore +from matplotlib.legend_handler import HandlerPatch # type: ignore +import math + + +SUBPLOT_ARG = lambda idx: 141 + idx + +CLUSTER_SIZES = [3, 5, 7, 9] +SIZE_COLOR_MAP = { + 3: ("orange", "bisque"), + 5: ("seagreen", "lightgreen"), + 7: ("steelblue", "skyblue"), + 9: ("chocolate", "mistyrose"), +} + +X_TICKS = list(range(1, 10)) +Y_TICKS = list(range(1, 6)) + + +def plot_cstr_bound(idx, cluster_size): + ax = 
plt.subplot(SUBPLOT_ARG(idx)) + + n = cluster_size + f = n // 2 + m = n - f + + line_color, fill_color = SIZE_COLOR_MAP[cluster_size] + + # Classic Paxos/Raft point + plt.scatter( + m, m, marker="D", s=100, color="dimgray", label="Classic Paxos/Raft", zorder=10 + ) + + # CRaft point + craft_q = math.ceil((n + m) / 2) + plt.scatter( + craft_q, + 1, + marker="X", + s=100, + color="lightcoral", + label="RS-Paxos/CRaft", + zorder=10, + ) + + # boundary lines + xs = [x for x in range(m, n + 1)] + ys = [x for x in range(m, 0, -1)] + plt.plot( + xs, + ys, + linewidth=2, + marker="o", + markersize=7, + color=line_color, + label="Crossword configs", + zorder=20, + ) + plt.vlines(m, ymin=m, ymax=m + 1.5, linestyles="--", color=line_color, zorder=20) + plt.vlines(n, ymin=1, ymax=2.5, linestyles="--", color=line_color, zorder=20) + + # correct region + xs = [m, m, n, n] + ys = [m, m + 1, 2, 1] + plt.fill(xs, ys, color=fill_color, label="Correct region", zorder=0) + + # latency & throughput optimized arrows + plt.arrow( + m + 0.3, + m + 1.7, + -0.9, + 0.9, + linewidth=1, + color="dimgray", + length_includes_head=True, + head_width=0.3, + overhang=0.5, + label="Tradeoff decisions", + ) + plt.text( + m + 0.18 if n == 3 else m + 0.5 if n == 9 else m + 0.4, + m + 2.78 if n == 3 else m + 2.0 if n == 9 else m + 2.4, + "Lat.\noptim.", + horizontalalignment="left", + verticalalignment="center", + color="dimgray", + ) + plt.arrow( + n - 0.3, + 3.3, + 0.9, + -0.9, + linewidth=1, + color="dimgray", + length_includes_head=True, + head_width=0.3, + overhang=0.5, + ) + plt.text( + n + 0.8 if n == 3 else n + 0.0 if n == 9 else n + 0.4, + 1 + 1.5 if n == 3 else 1 + 2.9 if n == 9 else 1 + 2.6, + "Tput.\noptim.", + horizontalalignment="left", + verticalalignment="center", + color="dimgray", + ) + + plt.axis("scaled") + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + + plt.xlim((0, X_TICKS[-1] + 0.7)) + plt.ylim((0, Y_TICKS[-1] + 2.7)) + plt.xticks(X_TICKS, list(map(str, X_TICKS))) + plt.yticks(Y_TICKS, list(map(str, Y_TICKS))) + + plt.xlabel("|Quorum|", loc="right") + plt.ylabel("#Shards\n/replica", loc="top", rotation=0, backgroundcolor="white") + ax.xaxis.set_label_coords(1.05, -0.18) + ax.yaxis.set_label_coords(0.2, 0.8) + + plt.title( + f"|Cluster|={n} f={f}", + x=0.27, + y=-0.38, + fontsize=10, + fontweight="bold", + backgroundcolor=fill_color, + ) + + return ax + + +def plot_all_cstr_bounds(): + matplotlib.rcParams.update( + { + "figure.figsize": (10, 3), + "font.size": 10, + "axes.axisbelow": False, + } + ) + fig = plt.figure() + + handles, labels = None, None + for idx, cluster_size in enumerate(CLUSTER_SIZES): + ax = plot_cstr_bound(idx, cluster_size) + if idx == len(CLUSTER_SIZES) - 1: + handles, labels = ax.get_legend_handles_labels() + + def make_legend_arrow( + legend, orig_handle, xdescent, ydescent, width, height, fontsize + ): + return mpatches.FancyArrow( + 0, + 0.5 * height, + width, + 0, + linewidth=1, + color="dimgray", + length_includes_head=True, + head_width=0.75 * height, + overhang=0.3, + ) + + # single legend group on top + handles = handles[-2:] + handles[:-2] + labels = labels[-2:] + labels[:-2] + fig.legend( + handles, + labels, + loc="lower center", + bbox_to_anchor=(0.5, 0.78), + ncol=len(handles), + handlelength=1.5, + handletextpad=0.5, + handler_map={mpatches.FancyArrow: HandlerPatch(patch_func=make_legend_arrow)}, + ) + + plt.tight_layout() + plt.savefig(f"results/cstr_bounds.png", dpi=300) + + +if __name__ == "__main__": + plot_all_cstr_bounds() From 
4514ac611dcecb7b3bf76aedd21bf501783915a1 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 28 Sep 2023 22:41:43 -0500 Subject: [PATCH 64/89] update constraint boundary figure script --- perf_sim/plot_cstr_bounds.py | 107 +++++++++++++++++++++++------------ 1 file changed, 70 insertions(+), 37 deletions(-) diff --git a/perf_sim/plot_cstr_bounds.py b/perf_sim/plot_cstr_bounds.py index 091f6f70..a9c49cfc 100644 --- a/perf_sim/plot_cstr_bounds.py +++ b/perf_sim/plot_cstr_bounds.py @@ -2,6 +2,7 @@ matplotlib.use("Agg") +import numpy as np # type: ignore import matplotlib.pyplot as plt # type: ignore import matplotlib.patches as mpatches # type: ignore from matplotlib.legend_handler import HandlerPatch # type: ignore @@ -12,9 +13,9 @@ CLUSTER_SIZES = [3, 5, 7, 9] SIZE_COLOR_MAP = { - 3: ("orange", "bisque"), - 5: ("seagreen", "lightgreen"), - 7: ("steelblue", "skyblue"), + 3: ("seagreen", "palegreen"), + 5: ("orange", "bisque"), + 7: ("steelblue", "powderblue"), 9: ("chocolate", "mistyrose"), } @@ -33,7 +34,7 @@ def plot_cstr_bound(idx, cluster_size): # Classic Paxos/Raft point plt.scatter( - m, m, marker="D", s=100, color="dimgray", label="Classic Paxos/Raft", zorder=10 + m, m, marker="s", s=100, color="black", label="Classic Paxos/Raft", zorder=10 ) # CRaft point @@ -42,7 +43,7 @@ def plot_cstr_bound(idx, cluster_size): craft_q, 1, marker="X", - s=100, + s=110, color="lightcoral", label="RS-Paxos/CRaft", zorder=10, @@ -61,13 +62,13 @@ def plot_cstr_bound(idx, cluster_size): label="Crossword configs", zorder=20, ) - plt.vlines(m, ymin=m, ymax=m + 1.5, linestyles="--", color=line_color, zorder=20) - plt.vlines(n, ymin=1, ymax=2.5, linestyles="--", color=line_color, zorder=20) + plt.vlines(m, ymin=m, ymax=m + 1.5, linestyles="-", color=line_color, zorder=20) + plt.vlines(n, ymin=1, ymax=2.5, linestyles="-", color=line_color, zorder=20) # correct region xs = [m, m, n, n] ys = [m, m + 1, 2, 1] - plt.fill(xs, ys, color=fill_color, label="Correct region", zorder=0) + plt.fill(xs, ys, color=fill_color, label="Region of fault-tolerance=f", zorder=0) # latency & throughput optimized arrows plt.arrow( @@ -83,8 +84,8 @@ def plot_cstr_bound(idx, cluster_size): label="Tradeoff decisions", ) plt.text( - m + 0.18 if n == 3 else m + 0.5 if n == 9 else m + 0.4, - m + 2.78 if n == 3 else m + 2.0 if n == 9 else m + 2.4, + m + 0.18 if n <= 5 else m + 0.5 if n == 9 else m + 0.4, + m + 2.78 if n <= 5 else m + 2.0 if n == 9 else m + 2.4, "Lat.\noptim.", horizontalalignment="left", verticalalignment="center", @@ -102,8 +103,8 @@ def plot_cstr_bound(idx, cluster_size): overhang=0.5, ) plt.text( - n + 0.8 if n == 3 else n + 0.0 if n == 9 else n + 0.4, - 1 + 1.5 if n == 3 else 1 + 2.9 if n == 9 else 1 + 2.6, + n + 0.8 if n <= 5 else n + 0.0 if n == 9 else n + 0.4, + 1 + 1.5 if n <= 5 else 1 + 2.9 if n == 9 else 1 + 2.6, "Tput.\noptim.", horizontalalignment="left", verticalalignment="center", @@ -126,7 +127,7 @@ def plot_cstr_bound(idx, cluster_size): plt.title( f"|Cluster|={n} f={f}", - x=0.27, + x=0.3, y=-0.38, fontsize=10, fontweight="bold", @@ -136,22 +137,7 @@ def plot_cstr_bound(idx, cluster_size): return ax -def plot_all_cstr_bounds(): - matplotlib.rcParams.update( - { - "figure.figsize": (10, 3), - "font.size": 10, - "axes.axisbelow": False, - } - ) - fig = plt.figure() - - handles, labels = None, None - for idx, cluster_size in enumerate(CLUSTER_SIZES): - ax = plot_cstr_bound(idx, cluster_size) - if idx == len(CLUSTER_SIZES) - 1: - handles, labels = ax.get_legend_handles_labels() - +def make_legend(fig, 
handles, labels): def make_legend_arrow( legend, orig_handle, xdescent, ydescent, width, height, fontsize ): @@ -163,23 +149,70 @@ def make_legend_arrow( linewidth=1, color="dimgray", length_includes_head=True, - head_width=0.75 * height, + head_width=0.6 * height, overhang=0.3, ) - # single legend group on top - handles = handles[-2:] + handles[:-2] - labels = labels[-2:] + labels[:-2] - fig.legend( - handles, - labels, + def make_legend_polygon( + legend, orig_handle, xdescent, ydescent, width, height, fontsize + ): + return mpatches.Polygon( + xy=np.array( + [ + [0.2 * width, 0.5 * height], + [0.2 * width, 1.2 * height], + [0.8 * width, 0.5 * height], + [0.8 * width, -0.2 * height], + ] + ), + closed=True, + color="dimgray", + ) + + order = [] + for s in ("Classic", "RS-", "Crossword", "Region", "Tradeoff"): + for i, l in enumerate(labels): + if s in l: + order.append(i) + break + sorted_handles = [handles[i] for i in order] + sorted_labels = [labels[i] for i in order] + + leg = fig.legend( + sorted_handles, + sorted_labels, loc="lower center", bbox_to_anchor=(0.5, 0.78), ncol=len(handles), handlelength=1.5, handletextpad=0.5, - handler_map={mpatches.FancyArrow: HandlerPatch(patch_func=make_legend_arrow)}, + handler_map={ + mpatches.FancyArrow: HandlerPatch(patch_func=make_legend_arrow), + mpatches.Polygon: HandlerPatch(patch_func=make_legend_polygon), + }, + ) + for h in leg.legend_handles[2:]: + h.set_color("dimgray") + + +def plot_all_cstr_bounds(): + matplotlib.rcParams.update( + { + "figure.figsize": (10, 3), + "font.size": 10, + "axes.axisbelow": False, + } ) + fig = plt.figure() + + handles, labels = None, None + for idx, cluster_size in enumerate(CLUSTER_SIZES): + ax = plot_cstr_bound(idx, cluster_size) + if idx == len(CLUSTER_SIZES) - 1: + handles, labels = ax.get_legend_handles_labels() + + # single legend group on top + make_legend(fig, handles, labels) plt.tight_layout() plt.savefig(f"results/cstr_bounds.png", dpi=300) From 94906503a5b1a8f571ed51a39e60dde6ef67c237 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sat, 30 Sep 2023 07:57:27 -0500 Subject: [PATCH 65/89] rename perf_sim/ to models/ --- {perf_sim => models}/perf_sim.py | 0 {perf_sim => models}/plot_cstr_bounds.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {perf_sim => models}/perf_sim.py (100%) rename {perf_sim => models}/plot_cstr_bounds.py (100%) diff --git a/perf_sim/perf_sim.py b/models/perf_sim.py similarity index 100% rename from perf_sim/perf_sim.py rename to models/perf_sim.py diff --git a/perf_sim/plot_cstr_bounds.py b/models/plot_cstr_bounds.py similarity index 100% rename from perf_sim/plot_cstr_bounds.py rename to models/plot_cstr_bounds.py From ae8cb3bc1d381d315198bfe7dd4b39bc73ea5ab3 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sat, 30 Sep 2023 09:00:00 -0500 Subject: [PATCH 66/89] staging progress on perf sim --- models/perf_sim.py | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/models/perf_sim.py b/models/perf_sim.py index 40d85d00..3a776b1c 100644 --- a/models/perf_sim.py +++ b/models/perf_sim.py @@ -41,16 +41,15 @@ def save(self, data): class Replica: - def __init__(self, env, is_leader, disk_a, disk_b): + def __init__(self, env, rid, disk_ab): self.env = env - self.is_leader = is_leader - self.disk_dev = DiskDev(env, disk_a, disk_b) - self.peers = dict() + self.rid = rid + self.api = NetLink(env, 0, 0) + self.disk_dev = DiskDev(env, disk_ab[0], disk_ab[1]) self.net_links = dict() - def add_peer(self, name, peer, net_a, 
net_b): - self.peers[name] = peer - self.net_links[name] = NetLink(self.env, net_a, net_b) + def add_peer(self, rid, net_ab): + self.net_links[rid] = NetLink(self.env, net_ab[0], net_ab[1]) def run(self): while True: @@ -59,10 +58,30 @@ def run(self): yield self.disk_dev.save(Data("d", 2)) print(f"saved") + def req(self, data): + self.api.send(data) + class Cluster: - def __init__(self, num_replicas): - pass + def __init__(self, env, num_replicas, disk_perf_map, net_perf_map): + self.env = env + self.replicas = [ + Replica( + env, + rid, + disk_perf_map[rid], + ) + for rid in range(num_replicas) + ] + self.leader = self.replicas[0] + + for replica in self.replicas: + for rid in range(num_replicas): + if rid != replica.rid: + replica.add_peer( + rid, + net_perf_map[(replica.rid, rid)], + ) if __name__ == "__main__": From 94b8f94111aa868b950a16328ea90f6c796e506c Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sat, 30 Sep 2023 13:19:03 -0500 Subject: [PATCH 67/89] staging progress on perf sim --- models/perf_sim.py | 218 ++++++++++++++++++++++++++++++++++------- src/server/external.rs | 3 +- 2 files changed, 183 insertions(+), 38 deletions(-) diff --git a/models/perf_sim.py b/models/perf_sim.py index 3a776b1c..512b6ef5 100644 --- a/models/perf_sim.py +++ b/models/perf_sim.py @@ -1,4 +1,10 @@ import simpy # type: ignore +from enum import Enum # type: ignore + + +############## +# Data types # +############## class Data: @@ -6,82 +12,220 @@ def __init__(self, mark, size): self.mark = mark self.size = size + def __str__(self): + return f"<{self.mark};{self.size}>" + + +class Req(Data): + def __init__(self, mark, size): + super().__init__(mark, size) + + +class Batch(Data): + def __init__(self, mark, vec): + self.vec = vec + size = sum((data.size for data in vec)) + super().__init__(mark, size) + + +############### +# Event types # +############### + + +class EType(Enum): + NetRecved = 1 + DiskSaved = 2 + ApiBatch = 3 + + +class Event: + def __init__(self, enum, info, value): + self.enum = enum + self.info = info + self.value = value + + def __str__(self): + return f"{{{self.enum}|{self.info}|{self.value}}}" -class NetLink: - def __init__(self, env, a, b): + +class NetRecved(Event): + def __init__(self, src, msg): + super().__init__(EType.NetRecved, src, msg) + + +class DiskSaved(Event): + def __init__(self, mark): + super().__init__(EType.DiskSaved, None, mark) + + +class ApiBatch(Event): + def __init__(self, batch): + super().__init__(EType.ApiBatch, None, batch) + + +################### +# Component types # +################### + + +class Device: + def __init__(self, env, l, t, v): self.env = env - self.a = a - self.b = b - self.store = simpy.Store(env) + self.l = l + self.t = t + self.v = v # TODO: use this + self.pipe = simpy.Store(env) def delay(self, data): - delay = self.a + self.b * data.size + delay = self.l + self.t * data.size yield self.env.timeout(delay) - self.store.put(data) + self.pipe.put(data) - def send(self, data): - self.env.process(self.delay(data)) + +class NetLink(Device): + def __init__(self, env, l, t, v, src, dst): + self.src = src + self.dst = dst + super().__init__(env, l, t, v) + + def send(self, msg): + self.env.process(self.delay(msg)) def recv(self): - return self.store.get() + msg = yield self.pipe.get() + return NetRecved(self.src, msg) -class DiskDev: - def __init__(self, env, a, b): +class DiskDev(Device): + def __init__(self, env, l, t, v, rid): + self.rid = rid + super().__init__(env, l, t, v) + + def write(self, ent): + self.env.process(self.delay(ent)) + 
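+    # (model note) completion is observed through saved(): a DiskSaved event pops
+    # out l + t * size time units after write() was called (e.g. with l=1, t=1 and
+    # a 2-unit entry, 3 units later). `v` is reserved for adding variance to this
+    # delay (see the TODO in Device.__init__).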
+ def saved(self): + ent = yield self.pipe.get() + return DiskSaved(ent.mark) + + +class ExtlApi: + def __init__(self, env, b, rid): self.env = env - self.a = a self.b = b + self.rid = rid + self.mark = 0 + self.ibuf = [] + self.tick = simpy.Container() + + self.env.process(self.ticker()) + + def ticker(self): + while True: + yield self.env.timeout(self.b) + self.tick.put(1) + + def req(self, req): + self.ibuf.append(req) + + def batch(self): + yield self.tick.get(1) + self.tick.level = 0 + + batch = Batch(self.mark, self.ibuf) + self.mark += 1 + self.ibuf = [] + return ApiBatch(batch) - def delay(self, data): - delay = self.a + self.b * data.size - yield self.env.timeout(delay) - def save(self, data): - self.env.process(self.delay(data)) +##################### +# Replica & Cluster # +##################### class Replica: - def __init__(self, env, rid, disk_ab): + def __init__(self, env, rid, api_b, disk_ltv): self.env = env self.rid = rid - self.api = NetLink(env, 0, 0) - self.disk_dev = DiskDev(env, disk_ab[0], disk_ab[1]) - self.net_links = dict() - - def add_peer(self, rid, net_ab): - self.net_links[rid] = NetLink(self.env, net_ab[0], net_ab[1]) + self.extl_api = ExtlApi(env, api_b) + self.disk_dev = DiskDev(env, disk_ltv[0], disk_ltv[1], disk_ltv[2]) + self.send_links = dict() + self.recv_links = dict() + + def add_peer(self, peer, net_ltv): + s2p_link = NetLink( + self.env, net_ltv[0], net_ltv[1], net_ltv[2], self.rid, peer.rid + ) + p2s_link = NetLink( + self.env, net_ltv[0], net_ltv[1], net_ltv[2], peer.rid, self.rid + ) + self.send_links[peer.rid] = s2p_link + self.recv_links[peer.rid] = p2s_link + peer.send_links[self.rid] = p2s_link + peer.recv_links[self.rid] = s2p_link def run(self): while True: - yield self.env.timeout(3) - print(f"req") - yield self.disk_dev.save(Data("d", 2)) - print(f"saved") + events = [ + self.env.process(self.extl_api.batch()), + self.env.process(self.disk_dev.saved()), + ] + for link in self.recv_links: + events.append(self.env.process(link.recv())) + event = yield self.env.any_of(events) + + print(event) def req(self, data): - self.api.send(data) + self.extl_api.req(data) class Cluster: - def __init__(self, env, num_replicas, disk_perf_map, net_perf_map): + def __init__(self, env, num_replicas, api_b, disk_perf_map, net_perf_map): self.env = env self.replicas = [ Replica( env, rid, + api_b, disk_perf_map[rid], ) for rid in range(num_replicas) ] self.leader = self.replicas[0] + for rid, replica in enumerate(self.replicas): + for peerid in range(rid + 1, num_replicas): + peer = self.replicas[peerid] + replica.add_peer(peer, net_perf_map[{rid, peerid}]) + + def launch(self): for replica in self.replicas: - for rid in range(num_replicas): - if rid != replica.rid: - replica.add_peer( - rid, - net_perf_map[(replica.rid, rid)], - ) + self.env.process(replica.run()) + + def req(self, data): + self.leader.req(data) + + +########## +# Client # +########## + + +class Client: + def __init__(self, env, cluster, freq): + self.env = env + self.service = cluster + self.gap = 1.0 / freq + + def driver(self): + while True: + yield self.env.timeout(self.gap) + self.service.req(Req("TODO", 8)) + + def start(self): + self.env.process(self.driver()) if __name__ == "__main__": diff --git a/src/server/external.rs b/src/server/external.rs index cc820c00..a0728861 100644 --- a/src/server/external.rs +++ b/src/server/external.rs @@ -21,7 +21,7 @@ use tokio::io::AsyncReadExt; use tokio::sync::{mpsc, Notify}; use tokio::sync::mpsc::error::TryRecvError; use tokio::task::JoinHandle; 
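// NOTE: with MissedTickBehavior::Skip, a batching tick that fires late is simply
// skipped rather than followed by a burst of catch-up ticks.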
-use tokio::time::{self, Duration}; +use tokio::time::{self, Duration, MissedTickBehavior}; /// External API request ID type. pub type RequestId = u64; @@ -490,6 +490,7 @@ impl ExternalApi { batch_notify: Arc, ) { let mut interval = time::interval(batch_interval); + interval.set_missed_tick_behavior(MissedTickBehavior::Skip); loop { interval.tick().await; From 41e1ee857abd3d451c7aca2df10cb49ea23e5760 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sat, 30 Sep 2023 20:20:01 -0500 Subject: [PATCH 68/89] staging progress on perf sim --- models/perf_sim.py | 233 ------------------- models/perf_simulation.py | 477 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 477 insertions(+), 233 deletions(-) delete mode 100644 models/perf_sim.py create mode 100644 models/perf_simulation.py diff --git a/models/perf_sim.py b/models/perf_sim.py deleted file mode 100644 index 512b6ef5..00000000 --- a/models/perf_sim.py +++ /dev/null @@ -1,233 +0,0 @@ -import simpy # type: ignore -from enum import Enum # type: ignore - - -############## -# Data types # -############## - - -class Data: - def __init__(self, mark, size): - self.mark = mark - self.size = size - - def __str__(self): - return f"<{self.mark};{self.size}>" - - -class Req(Data): - def __init__(self, mark, size): - super().__init__(mark, size) - - -class Batch(Data): - def __init__(self, mark, vec): - self.vec = vec - size = sum((data.size for data in vec)) - super().__init__(mark, size) - - -############### -# Event types # -############### - - -class EType(Enum): - NetRecved = 1 - DiskSaved = 2 - ApiBatch = 3 - - -class Event: - def __init__(self, enum, info, value): - self.enum = enum - self.info = info - self.value = value - - def __str__(self): - return f"{{{self.enum}|{self.info}|{self.value}}}" - - -class NetRecved(Event): - def __init__(self, src, msg): - super().__init__(EType.NetRecved, src, msg) - - -class DiskSaved(Event): - def __init__(self, mark): - super().__init__(EType.DiskSaved, None, mark) - - -class ApiBatch(Event): - def __init__(self, batch): - super().__init__(EType.ApiBatch, None, batch) - - -################### -# Component types # -################### - - -class Device: - def __init__(self, env, l, t, v): - self.env = env - self.l = l - self.t = t - self.v = v # TODO: use this - self.pipe = simpy.Store(env) - - def delay(self, data): - delay = self.l + self.t * data.size - yield self.env.timeout(delay) - self.pipe.put(data) - - -class NetLink(Device): - def __init__(self, env, l, t, v, src, dst): - self.src = src - self.dst = dst - super().__init__(env, l, t, v) - - def send(self, msg): - self.env.process(self.delay(msg)) - - def recv(self): - msg = yield self.pipe.get() - return NetRecved(self.src, msg) - - -class DiskDev(Device): - def __init__(self, env, l, t, v, rid): - self.rid = rid - super().__init__(env, l, t, v) - - def write(self, ent): - self.env.process(self.delay(ent)) - - def saved(self): - ent = yield self.pipe.get() - return DiskSaved(ent.mark) - - -class ExtlApi: - def __init__(self, env, b, rid): - self.env = env - self.b = b - self.rid = rid - self.mark = 0 - self.ibuf = [] - self.tick = simpy.Container() - - self.env.process(self.ticker()) - - def ticker(self): - while True: - yield self.env.timeout(self.b) - self.tick.put(1) - - def req(self, req): - self.ibuf.append(req) - - def batch(self): - yield self.tick.get(1) - self.tick.level = 0 - - batch = Batch(self.mark, self.ibuf) - self.mark += 1 - self.ibuf = [] - return ApiBatch(batch) - - -##################### -# Replica & Cluster # 
-##################### - - -class Replica: - def __init__(self, env, rid, api_b, disk_ltv): - self.env = env - self.rid = rid - self.extl_api = ExtlApi(env, api_b) - self.disk_dev = DiskDev(env, disk_ltv[0], disk_ltv[1], disk_ltv[2]) - self.send_links = dict() - self.recv_links = dict() - - def add_peer(self, peer, net_ltv): - s2p_link = NetLink( - self.env, net_ltv[0], net_ltv[1], net_ltv[2], self.rid, peer.rid - ) - p2s_link = NetLink( - self.env, net_ltv[0], net_ltv[1], net_ltv[2], peer.rid, self.rid - ) - self.send_links[peer.rid] = s2p_link - self.recv_links[peer.rid] = p2s_link - peer.send_links[self.rid] = p2s_link - peer.recv_links[self.rid] = s2p_link - - def run(self): - while True: - events = [ - self.env.process(self.extl_api.batch()), - self.env.process(self.disk_dev.saved()), - ] - for link in self.recv_links: - events.append(self.env.process(link.recv())) - event = yield self.env.any_of(events) - - print(event) - - def req(self, data): - self.extl_api.req(data) - - -class Cluster: - def __init__(self, env, num_replicas, api_b, disk_perf_map, net_perf_map): - self.env = env - self.replicas = [ - Replica( - env, - rid, - api_b, - disk_perf_map[rid], - ) - for rid in range(num_replicas) - ] - self.leader = self.replicas[0] - - for rid, replica in enumerate(self.replicas): - for peerid in range(rid + 1, num_replicas): - peer = self.replicas[peerid] - replica.add_peer(peer, net_perf_map[{rid, peerid}]) - - def launch(self): - for replica in self.replicas: - self.env.process(replica.run()) - - def req(self, data): - self.leader.req(data) - - -########## -# Client # -########## - - -class Client: - def __init__(self, env, cluster, freq): - self.env = env - self.service = cluster - self.gap = 1.0 / freq - - def driver(self): - while True: - yield self.env.timeout(self.gap) - self.service.req(Req("TODO", 8)) - - def start(self): - self.env.process(self.driver()) - - -if __name__ == "__main__": - env = simpy.Environment() - env.run(until=15) diff --git a/models/perf_simulation.py b/models/perf_simulation.py new file mode 100644 index 00000000..6d5f29b2 --- /dev/null +++ b/models/perf_simulation.py @@ -0,0 +1,477 @@ +import simpy # type: ignore +from enum import Enum # type: ignore + + +############## +# Data types # +############## + + +class Data: + def __init__(self, mark, size): + self.mark = mark + self.size = size + + def __str__(self): + return f"<{self.mark};{self.size}>" + + +class Req(Data): + def __init__(self, cid, mark, size): + self.cid = cid + super().__init__(mark, size) + + +class Batch(Data): + def __init__(self, mark, vec): + self.vec = vec + size = sum((data.size for data in vec)) + super().__init__(mark, size) + + +############### +# Event types # +############### + + +class EType(Enum): + NetRecved = 1 + DiskSaved = 2 + ApiBatch = 3 + SendNewReq = 4 + + +class Event: + def __init__(self, enum, info, value): + self.enum = enum + self.info = info + self.value = value + + def __str__(self): + return f"{{{self.enum}|{self.info}|{self.value}}}" + + +class NetRecved(Event): + def __init__(self, src, msg): + super().__init__(EType.NetRecved, src, msg) + + +class DiskSaved(Event): + def __init__(self, mark): + super().__init__(EType.DiskSaved, None, mark) + + +class ApiBatch(Event): + def __init__(self, batch): + super().__init__(EType.ApiBatch, None, batch) + + +class SendNewReq(Event): + def __init__(self, mark): + super().__init__(EType.SendNewReq, None, mark) + + +################### +# Component types # +################### + + +class Device: + def __init__(self, 
env, l, t, v): + self.env = env + self.l = l + self.t = t + self.v = v # TODO: use this + self.pipe = simpy.Store(env) + + def delay(self, data): + delay = self.l + self.t * data.size + yield self.env.timeout(delay) + self.pipe.put(data) + + +class NetLink(Device): + def __init__(self, env, l, t, v, src, dst): + self.src = src + self.dst = dst + super().__init__(env, l, t, v) + + def send(self, msg): + if self.src == 1 and self.dst == 0: + print("!!!", msg) + self.env.process(self.delay(msg)) + + def recv(self): + if self.src == 1 and self.dst == 0: + print("???", self.env.now, self.pipe.items) + msg = yield self.pipe.get() + return NetRecved(self.src, msg) + + +class DiskDev(Device): + def __init__(self, env, l, t, v, rid): + self.rid = rid + super().__init__(env, l, t, v) + + def write(self, ent): + self.env.process(self.delay(ent)) + + def saved(self): + ent = yield self.pipe.get() + return DiskSaved(ent.mark) + + +class ExtlApi: + def __init__(self, env, l, t, v, b, rid): + self.env = env + self.l = l + self.t = t + self.v = v + self.b = b + + self.rid = rid + self.req_links = dict() + self.ack_links = dict() + + self.mark = 0 + self.tick = simpy.Container(env, capacity=1) + + self.env.process(self.ticker()) + + def connect(self, client): + req_link = NetLink(self.env, self.l, self.t, self.v, client.cid, self.rid) + ack_link = NetLink(self.env, self.l, self.t, self.v, self.rid, client.cid) + self.req_links[client.cid] = req_link + self.ack_links[client.cid] = ack_link + return (req_link, ack_link) + + def ticker(self): + while True: + yield self.env.timeout(self.b) + if self.tick.level == 0: + self.tick.put(1) + + def batch(self): + while True: + yield self.tick.get(1) + + reqs = [] + for link in self.req_links.values(): + if len(link.pipe.items) > 0: + reqs += link.pipe.items + link.pipe.items = [] + + # do not return if no reqs available at this tick + if len(reqs) == 0: + continue + + batch = Batch(self.mark, reqs) + self.mark += 1 + return ApiBatch(batch) + + def ack(self, cid, mark): + if cid not in self.ack_links: + raise RuntimeError(f"cid {cid} not in connected") + self.ack_links[cid].send(mark) + + +##################### +# Replica & Cluster # +##################### + + +class Replica: + def __init__(self, env, rid, api_ltvb, disk_ltv, protocol, **protocol_args): + self.env = env + self.rid = rid + self.extl_api = ExtlApi( + env, api_ltvb[0], api_ltvb[1], api_ltvb[2], api_ltvb[3], rid + ) + self.disk_dev = DiskDev(env, disk_ltv[0], disk_ltv[1], disk_ltv[2], rid) + self.send_links = dict() + self.recv_links = dict() + + # protocol-specific fields & event handlers + self.protocol = protocol(self, **protocol_args) + + def add_peer(self, peer, net_ltv): + s2p_link = NetLink( + self.env, net_ltv[0], net_ltv[1], net_ltv[2], self.rid, peer.rid + ) + p2s_link = NetLink( + self.env, net_ltv[0], net_ltv[1], net_ltv[2], peer.rid, self.rid + ) + self.send_links[peer.rid] = s2p_link + self.recv_links[peer.rid] = p2s_link + peer.send_links[self.rid] = p2s_link + peer.recv_links[self.rid] = s2p_link + + def run(self): + while True: + events = [ + self.env.process(self.extl_api.batch()), + self.env.process(self.disk_dev.saved()), + ] + for link in self.recv_links.values(): + events.append(self.env.process(link.recv())) + print("XXX", self.rid, len(events)) + + # could get multiple completed triggers at this yield + conds = yield self.env.any_of(events) + for event in conds.values(): + print(f"{self.env.now}: R{self.rid} {event}") + if event.enum == EType.ApiBatch: + 
self.protocol.handle_api_batch(event.value) + elif event.enum == EType.DiskSaved: + self.protocol.handle_disk_saved(event.value) + elif event.enum == EType.NetRecved: + self.protocol.handle_net_recved(event.info, event.value) + else: + raise RuntimeError(f"unrecognized event type: {event}") + + def connect(self, client): + return self.extl_api.connect(client) + + +class Cluster: + def __init__( + self, + env, + num_replicas, + api_ltvb, + disk_ltv_map, + net_ltv_map, + protocol, + **protocol_args, + ): + self.env = env + self.replicas = [ + Replica( + env, + rid, + api_ltvb, + disk_ltv_map[rid], + protocol, + **protocol_args, + ) + for rid in range(num_replicas) + ] + self.leader = self.replicas[0] + + for rid, replica in enumerate(self.replicas): + for peerid in range(rid + 1, num_replicas): + peer = self.replicas[peerid] + replica.add_peer(peer, net_ltv_map[(rid, peerid)]) + + def launch(self): + for replica in self.replicas: + self.env.process(replica.run()) + + def connect(self, client): + return self.leader.connect(client) + + +############# +# Protocols # +############# + + +class Protocol: + def __init__(self, replica): + self.replica = replica + + +class MultiPaxos(Protocol): + def __init__(self, replica, quorum_size): + super().__init__(replica) + + self.quorum_size = quorum_size + self.insts = [] + + class Instance: + def __init__(self, batch=None): + self.batch = batch + self.num_replies = 0 + self.from_peer = 0 + self.client_acked = False + + class AcceptMsg(Data): + def __init__(self, slot, batch): + super().__init__(f"a-{slot}", batch.size + 8) + self.batch = batch + + class AcceptReply(Data): + def __init__(self, slot): + super().__init__(f"r-{slot}", 8) + + def handle_api_batch(self, batch): + self.insts.append(self.Instance(batch)) + slot = len(self.insts) - 1 + + for link in self.replica.send_links.values(): + link.send(self.AcceptMsg(slot, batch)) + + self.replica.disk_dev.write(self.AcceptMsg(slot, batch)) + + def handle_disk_saved(self, mark): + if not mark.startswith("a-"): + raise RuntimeError(f"unrecognized ent mark: {mark}") + + slot = int(mark[2:]) + assert slot < len(self.insts) + self.insts[slot].num_replies += 1 + + if ( + not self.insts[slot].client_acked + and self.insts[slot].num_replies >= self.quorum_size + ): + self.ack_client_reqs(slot) + + def handle_net_recved(self, peer, msg): + if msg.mark.startswith("a-"): + slot = int(msg.mark[2:]) + while slot >= len(self.insts): + self.insts.append(self.Instance()) + self.insts[slot].from_peer = peer + self.insts[slot].batch = msg.batch + + self.replica.send_links[peer].send(self.AcceptReply(slot)) + + elif msg.mark.startswith("r-"): + print("!!!", slot) + slot = int(msg.mark[2:]) + assert slot < len(self.insts) + self.insts[slot].num_replies += 1 + + if ( + not self.insts[slot].client_acked + and self.insts[slot].num_replies >= self.quorum_size + ): + self.ack_client_reqs(slot) + + else: + raise RuntimeError(f"unrecognized msg mark: {msg.mark}") + + def ack_client_reqs(self, slot): + assert not self.insts[slot].client_acked + + for req in self.insts[slot].batch.vec: + self.replica.extl_api.ack(req.cid, req.mark) + + self.insts[slot].client_acked = True + + +########## +# Client # +########## + + +class Stats: + def __init__(self, env): + self.env = env + self.total_cnt = 0 + self.req_times = dict() + self.ack_times = dict() + + def add_req(self, mark): + assert mark not in self.req_times + self.req_times[mark] = self.env.now + + def add_ack(self, mark): + assert mark in self.req_times + assert mark not in 
self.ack_times + self.total_cnt += 1 + self.ack_times[mark] = self.env.now + print("!!!", self.env.now) + + def clear(self): + for mark in self.ack_times: + del self.req_times[mark] + self.ack_times = dict() + + +class Client: + def __init__(self, env, cluster, cid, freq, vsize): + self.env = env + self.cid = cid + self.service = cluster + self.req_link, self.ack_link = self.service.connect(self) + + self.gap = 1.0 / freq + self.vsize = vsize + + self.mark = 0 + self.tick = simpy.Container(env, capacity=1) + self.stats = Stats(env) + + self.env.process(self.ticker()) + + def ticker(self): + while True: + yield self.env.timeout(self.gap) + if self.tick.level == 0: + self.tick.put(1) + + def new_req(self): + yield self.tick.get(1) + self.mark += 1 + return SendNewReq(self.mark) + + def driver(self): + while True: + events = [ + self.env.process(self.new_req()), + self.env.process(self.ack_link.recv()), + ] + + # could get multiple completed triggers at this yield + conds = yield self.env.any_of(events) + for event in conds.values(): + print(f"{self.env.now}: C{self.cid} {event}") + if event.enum == EType.SendNewReq: + mark = event.value + self.req_link.send(Req(self.cid, mark, self.vsize)) + self.stats.add_req(mark) + elif event.enum == EType.NetRecved: + mark = event.value + self.stats.add_ack(mark) + else: + raise RuntimeError(f"unrecognized event type: {event}") + + def start(self): + self.env.process(self.driver()) + + +################# +# Main entrance # +################# + + +if __name__ == "__main__": + num_replicas = 5 + api_ltvb = (1, 1, 0, 3) + disk_ltv_map = {rid: (1, 1, 0) for rid in range(num_replicas)} + net_ltv_map = dict() + for rid in range(num_replicas): + for peerid in range(rid + 1, num_replicas): + net_ltv_map[(rid, peerid)] = (1, 1, 0) + freq = 1 + vsize = 1 + + env = simpy.Environment() + + cluster = Cluster( + env, + 5, + api_ltvb, + disk_ltv_map, + net_ltv_map, + MultiPaxos, + quorum_size=3, + ) + cluster.launch() + + client = Client(env, cluster, 2957, freq, vsize) + client.start() + + env.run(until=60) From 04b5c9619069bef66efb423b3afd0af51d136ab6 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sat, 30 Sep 2023 21:22:10 -0500 Subject: [PATCH 69/89] staging progress on perf sim --- models/perf_simulation.py | 72 ++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 28 deletions(-) diff --git a/models/perf_simulation.py b/models/perf_simulation.py index 6d5f29b2..f4ec0436 100644 --- a/models/perf_simulation.py +++ b/models/perf_simulation.py @@ -22,6 +22,12 @@ def __init__(self, cid, mark, size): super().__init__(mark, size) +class Ack(Data): + def __init__(self, cid, mark): + self.cid = cid + super().__init__(mark, 8) + + class Batch(Data): def __init__(self, mark, vec): self.vec = vec @@ -97,13 +103,9 @@ def __init__(self, env, l, t, v, src, dst): super().__init__(env, l, t, v) def send(self, msg): - if self.src == 1 and self.dst == 0: - print("!!!", msg) self.env.process(self.delay(msg)) def recv(self): - if self.src == 1 and self.dst == 0: - print("???", self.env.now, self.pipe.items) msg = yield self.pipe.get() return NetRecved(self.src, msg) @@ -172,7 +174,7 @@ def batch(self): def ack(self, cid, mark): if cid not in self.ack_links: raise RuntimeError(f"cid {cid} not in connected") - self.ack_links[cid].send(mark) + self.ack_links[cid].send(Ack(cid, mark)) ##################### @@ -207,25 +209,36 @@ def add_peer(self, peer, net_ltv): peer.recv_links[self.rid] = s2p_link def run(self): - while True: - events = [ - 
self.env.process(self.extl_api.batch()), - self.env.process(self.disk_dev.saved()), - ] - for link in self.recv_links.values(): - events.append(self.env.process(link.recv())) - print("XXX", self.rid, len(events)) + events = { + "api_batch": self.env.process(self.extl_api.batch()), + "disk_saved": self.env.process(self.disk_dev.saved()), + } + for peer, link in self.recv_links.items(): + events[("net_recved", peer)] = self.env.process(link.recv()) + while True: # could get multiple completed triggers at this yield - conds = yield self.env.any_of(events) + conds = yield self.env.any_of(events.values()) for event in conds.values(): - print(f"{self.env.now}: R{self.rid} {event}") + # print(f"{self.env.now}: R{self.rid} {event}") + if event.enum == EType.ApiBatch: - self.protocol.handle_api_batch(event.value) + batch = event.value + self.protocol.handle_api_batch(batch) + events["api_batch"] = self.env.process(self.extl_api.batch()) + elif event.enum == EType.DiskSaved: - self.protocol.handle_disk_saved(event.value) + mark = event.value + self.protocol.handle_disk_saved(mark) + events["disk_saved"] = self.env.process(self.disk_dev.saved()) + elif event.enum == EType.NetRecved: - self.protocol.handle_net_recved(event.info, event.value) + peer, msg = event.info, event.value + self.protocol.handle_net_recved(peer, msg) + events[("net_recved", peer)] = self.env.process( + self.recv_links[peer].recv() + ) + else: raise RuntimeError(f"unrecognized event type: {event}") @@ -338,7 +351,6 @@ def handle_net_recved(self, peer, msg): self.replica.send_links[peer].send(self.AcceptReply(slot)) elif msg.mark.startswith("r-"): - print("!!!", slot) slot = int(msg.mark[2:]) assert slot < len(self.insts) self.insts[slot].num_replies += 1 @@ -382,7 +394,6 @@ def add_ack(self, mark): assert mark not in self.ack_times self.total_cnt += 1 self.ack_times[mark] = self.env.now - print("!!!", self.env.now) def clear(self): for mark in self.ack_times: @@ -418,23 +429,28 @@ def new_req(self): return SendNewReq(self.mark) def driver(self): - while True: - events = [ - self.env.process(self.new_req()), - self.env.process(self.ack_link.recv()), - ] + events = { + "req": self.env.process(self.new_req()), + "ack": self.env.process(self.ack_link.recv()), + } + while True: # could get multiple completed triggers at this yield - conds = yield self.env.any_of(events) + conds = yield self.env.any_of(events.values()) for event in conds.values(): - print(f"{self.env.now}: C{self.cid} {event}") + # print(f"{self.env.now}: C{self.cid} {event}") + if event.enum == EType.SendNewReq: mark = event.value self.req_link.send(Req(self.cid, mark, self.vsize)) self.stats.add_req(mark) + events["req"] = self.env.process(self.new_req()) + elif event.enum == EType.NetRecved: - mark = event.value + mark = event.value.mark self.stats.add_ack(mark) + events["ack"] = self.env.process(self.ack_link.recv()) + else: raise RuntimeError(f"unrecognized event type: {event}") From 8f32971e8c7acd20293b3bf19f2e169910e9e95f Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sat, 30 Sep 2023 22:01:09 -0500 Subject: [PATCH 70/89] staging progress on perf sim --- models/perf_simulation.py | 67 ++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 18 deletions(-) diff --git a/models/perf_simulation.py b/models/perf_simulation.py index f4ec0436..def8d3d9 100644 --- a/models/perf_simulation.py +++ b/models/perf_simulation.py @@ -302,10 +302,10 @@ def __init__(self, replica, quorum_size): self.insts = [] class Instance: - def __init__(self, batch=None): 
- self.batch = batch + def __init__(self): + self.batch = None self.num_replies = 0 - self.from_peer = 0 + self.from_peer = -1 self.client_acked = False class AcceptMsg(Data): @@ -318,8 +318,9 @@ def __init__(self, slot): super().__init__(f"r-{slot}", 8) def handle_api_batch(self, batch): - self.insts.append(self.Instance(batch)) + self.insts.append(self.Instance()) slot = len(self.insts) - 1 + self.insts[slot].batch = batch for link in self.replica.send_links.values(): link.send(self.AcceptMsg(slot, batch)) @@ -329,28 +330,38 @@ def handle_api_batch(self, batch): def handle_disk_saved(self, mark): if not mark.startswith("a-"): raise RuntimeError(f"unrecognized ent mark: {mark}") - slot = int(mark[2:]) assert slot < len(self.insts) - self.insts[slot].num_replies += 1 - if ( - not self.insts[slot].client_acked - and self.insts[slot].num_replies >= self.quorum_size - ): - self.ack_client_reqs(slot) + if self.insts[slot].from_peer < 0: + # disk save on leader + self.insts[slot].num_replies += 1 + + if ( + not self.insts[slot].client_acked + and self.insts[slot].num_replies >= self.quorum_size + ): + self.ack_client_reqs(slot) + + else: + # disk save on follower + self.replica.send_links[self.insts[slot].from_peer].send( + self.AcceptReply(slot) + ) def handle_net_recved(self, peer, msg): if msg.mark.startswith("a-"): + # net recv on follower slot = int(msg.mark[2:]) while slot >= len(self.insts): self.insts.append(self.Instance()) self.insts[slot].from_peer = peer self.insts[slot].batch = msg.batch - self.replica.send_links[peer].send(self.AcceptReply(slot)) + self.replica.disk_dev.write(self.AcceptMsg(slot, msg.batch)) elif msg.mark.startswith("r-"): + # net recv on leader slot = int(msg.mark[2:]) assert slot < len(self.insts) self.insts[slot].num_replies += 1 @@ -381,20 +392,28 @@ def ack_client_reqs(self, slot): class Stats: def __init__(self, env): self.env = env - self.total_cnt = 0 + self.total_sent = 0 + self.total_acks = 0 self.req_times = dict() self.ack_times = dict() def add_req(self, mark): assert mark not in self.req_times + self.total_sent += 1 self.req_times[mark] = self.env.now def add_ack(self, mark): assert mark in self.req_times assert mark not in self.ack_times - self.total_cnt += 1 + self.total_acks += 1 self.ack_times[mark] = self.env.now + def display(self, chunk_time): + lats = [self.ack_times[m] - self.req_times[m] for m in self.ack_times] + avg_tput = len(lats) / chunk_time + avg_lat = sum(lats) / len(lats) if len(lats) > 0 else 0.0 + return f"{avg_tput:>9.2f} {avg_lat:>9.2f} {len(lats):>7d} {self.total_acks:>8d} / {self.total_sent:<8d}" + def clear(self): for mark in self.ack_times: del self.req_times[mark] @@ -402,7 +421,7 @@ def clear(self): class Client: - def __init__(self, env, cluster, cid, freq, vsize): + def __init__(self, env, cluster, cid, freq, vsize, chunk_time): self.env = env self.cid = cid self.service = cluster @@ -413,7 +432,10 @@ def __init__(self, env, cluster, cid, freq, vsize): self.mark = 0 self.tick = simpy.Container(env, capacity=1) + self.stats = Stats(env) + self.last_print = 0 + self.chunk_time = chunk_time self.env.process(self.ticker()) @@ -428,12 +450,15 @@ def new_req(self): self.mark += 1 return SendNewReq(self.mark) - def driver(self): + def loop(self): events = { "req": self.env.process(self.new_req()), "ack": self.env.process(self.ack_link.recv()), } + print( + f"{'Time':>5s}: {'Tput':>9s} {'Lat':>9s} {'Chunk':>7s} {'Reply':>8s} / {'Total':<8s}" + ) while True: # could get multiple completed triggers at this yield conds = yield 
self.env.any_of(events.values()) @@ -454,8 +479,14 @@ def driver(self): else: raise RuntimeError(f"unrecognized event type: {event}") + # print chunk-average stats + if self.env.now - self.last_print > self.chunk_time: + print(f"{self.env.now:>5.1f}: {self.stats.display(self.chunk_time)}") + self.stats.clear() + self.last_print = self.env.now + def start(self): - self.env.process(self.driver()) + self.env.process(self.loop()) ################# @@ -487,7 +518,7 @@ def start(self): ) cluster.launch() - client = Client(env, cluster, 2957, freq, vsize) + client = Client(env, cluster, 2957, freq, vsize, 10) client.start() env.run(until=60) From a6623d2afbc00c86d55738e5173a61e5ad58f4ba Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Mon, 2 Oct 2023 00:13:27 -0500 Subject: [PATCH 71/89] staging progress on perf sim --- README.md | 9 +- models/perf_simulation.py | 668 ++++++++++++++++++++++++++++++------- models/plot_sim_results.py | 105 ++++++ 3 files changed, 651 insertions(+), 131 deletions(-) create mode 100644 models/plot_sim_results.py diff --git a/README.md b/README.md index 6a6224ce..fb43e7d9 100644 --- a/README.md +++ b/README.md @@ -155,15 +155,20 @@ Complete cluster management and benchmarking scripts are available in another re - [ ] specialize read-only commands? - [ ] separate commit vs. exec responses? - [ ] membership discovery & view changes +- [x] implementation of RS-Paxos - [ ] implementation of Raft -- [ ] implementation of Crossword prototype +- [ ] implementation of CRaft +- [x] implementation of Crossword prototype - [x] fault recovery reads - [ ] follower gossiping + - [ ] fall-back mechanism + - [ ] workload adaptiveness + - [ ] unbalanced assignment - [x] client-side utilities - [x] REPL-style client - [x] random benchmarking client - [x] testing client - - [ ] benchmarking with YCSB input + - [ ] YCSB-driven client - [ ] better README & documentation --- diff --git a/models/perf_simulation.py b/models/perf_simulation.py index def8d3d9..bfb6734a 100644 --- a/models/perf_simulation.py +++ b/models/perf_simulation.py @@ -1,5 +1,15 @@ import simpy # type: ignore -from enum import Enum # type: ignore +from enum import Enum +import random +import statistics +import math +import pickle + +import matplotlib + +matplotlib.use("Agg") + +import matplotlib.pyplot as plt ############## @@ -28,11 +38,16 @@ def __init__(self, cid, mark): super().__init__(mark, 8) -class Batch(Data): - def __init__(self, mark, vec): - self.vec = vec - size = sum((data.size for data in vec)) - super().__init__(mark, size) +class Codeword(Data): + def __init__(self, req, n, m, flags): + assert len(flags) > 0 + assert len(flags) <= n + self.req = req + self.m = m + self.n = n + self.flags = flags + shard_size = req.size / m + super().__init__(req.mark, shard_size * len(flags)) ############### @@ -43,7 +58,7 @@ def __init__(self, mark, vec): class EType(Enum): NetRecved = 1 DiskSaved = 2 - ApiBatch = 3 + ApiGotReq = 3 SendNewReq = 4 @@ -67,9 +82,9 @@ def __init__(self, mark): super().__init__(EType.DiskSaved, None, mark) -class ApiBatch(Event): - def __init__(self, batch): - super().__init__(EType.ApiBatch, None, batch) +class ApiGotReq(Event): + def __init__(self, cid, req): + super().__init__(EType.ApiGotReq, cid, req) class SendNewReq(Event): @@ -83,24 +98,32 @@ def __init__(self, mark): class Device: - def __init__(self, env, l, t, v): + def __init__(self, env, l, t, lv, tv): self.env = env - self.l = l - self.t = t - self.v = v # TODO: use this + self.l = l # latency factor in ms + self.t = t # ms to 
transfer 1 MB + self.lv = lv # max variation multiplier for l + self.tv = tv # max variation multiplier for t self.pipe = simpy.Store(env) + if self.lv < 1: + raise RuntimeError(f"invalid variation ratio {self.lv}") + if self.tv < 1: + raise RuntimeError(f"invalid variation ratio {self.tv}") + def delay(self, data): - delay = self.l + self.t * data.size + l = self.l * random.uniform(1, self.lv) + t = self.t * random.uniform(1, self.tv) + delay = l + t * (data.size / 1000000.0) yield self.env.timeout(delay) self.pipe.put(data) class NetLink(Device): - def __init__(self, env, l, t, v, src, dst): + def __init__(self, env, l, t, lv, tv, src, dst): self.src = src self.dst = dst - super().__init__(env, l, t, v) + super().__init__(env, l, t, lv, tv) def send(self, msg): self.env.process(self.delay(msg)) @@ -111,9 +134,9 @@ def recv(self): class DiskDev(Device): - def __init__(self, env, l, t, v, rid): + def __init__(self, env, l, t, lv, tv, rid): self.rid = rid - super().__init__(env, l, t, v) + super().__init__(env, l, t, lv, tv) def write(self, ent): self.env.process(self.delay(ent)) @@ -124,52 +147,32 @@ def saved(self): class ExtlApi: - def __init__(self, env, l, t, v, b, rid): + def __init__(self, env, l, t, lv, tv, rid): self.env = env + self.rid = rid self.l = l self.t = t - self.v = v - self.b = b - - self.rid = rid + self.lv = lv + self.tv = tv self.req_links = dict() self.ack_links = dict() - self.mark = 0 - self.tick = simpy.Container(env, capacity=1) - - self.env.process(self.ticker()) - def connect(self, client): - req_link = NetLink(self.env, self.l, self.t, self.v, client.cid, self.rid) - ack_link = NetLink(self.env, self.l, self.t, self.v, self.rid, client.cid) + req_link = NetLink( + self.env, self.l, self.t, self.lv, self.tv, client.cid, self.rid + ) + ack_link = NetLink( + self.env, self.l, self.t, self.lv, self.tv, self.rid, client.cid + ) self.req_links[client.cid] = req_link self.ack_links[client.cid] = ack_link return (req_link, ack_link) - def ticker(self): - while True: - yield self.env.timeout(self.b) - if self.tick.level == 0: - self.tick.put(1) - - def batch(self): - while True: - yield self.tick.get(1) - - reqs = [] - for link in self.req_links.values(): - if len(link.pipe.items) > 0: - reqs += link.pipe.items - link.pipe.items = [] - - # do not return if no reqs available at this tick - if len(reqs) == 0: - continue - - batch = Batch(self.mark, reqs) - self.mark += 1 - return ApiBatch(batch) + def req(self): + # NOTE: hardcode assuming only one client connected + event = yield self.env.process(self.req_links[2957].recv()) + req = event.value + return ApiGotReq(2957, req) def ack(self, cid, mark): if cid not in self.ack_links: @@ -183,13 +186,15 @@ def ack(self, cid, mark): class Replica: - def __init__(self, env, rid, api_ltvb, disk_ltv, protocol, **protocol_args): + def __init__(self, env, rid, api_ltv, disk_ltv, protocol, **protocol_args): self.env = env self.rid = rid self.extl_api = ExtlApi( - env, api_ltvb[0], api_ltvb[1], api_ltvb[2], api_ltvb[3], rid + env, api_ltv[0], api_ltv[1], api_ltv[2], api_ltv[3], rid + ) + self.disk_dev = DiskDev( + env, disk_ltv[0], disk_ltv[1], disk_ltv[2], disk_ltv[3], rid ) - self.disk_dev = DiskDev(env, disk_ltv[0], disk_ltv[1], disk_ltv[2], rid) self.send_links = dict() self.recv_links = dict() @@ -198,10 +203,10 @@ def __init__(self, env, rid, api_ltvb, disk_ltv, protocol, **protocol_args): def add_peer(self, peer, net_ltv): s2p_link = NetLink( - self.env, net_ltv[0], net_ltv[1], net_ltv[2], self.rid, peer.rid + self.env, 
net_ltv[0], net_ltv[1], net_ltv[2], net_ltv[3], self.rid, peer.rid ) p2s_link = NetLink( - self.env, net_ltv[0], net_ltv[1], net_ltv[2], peer.rid, self.rid + self.env, net_ltv[0], net_ltv[1], net_ltv[2], net_ltv[3], peer.rid, self.rid ) self.send_links[peer.rid] = s2p_link self.recv_links[peer.rid] = p2s_link @@ -210,31 +215,34 @@ def add_peer(self, peer, net_ltv): def run(self): events = { - "api_batch": self.env.process(self.extl_api.batch()), "disk_saved": self.env.process(self.disk_dev.saved()), } for peer, link in self.recv_links.items(): events[("net_recved", peer)] = self.env.process(link.recv()) + # NOTE: hardcoding to have non-leader not do api_got_req + if self.rid == 0: + events["api_got_req"] = self.env.process(self.extl_api.req()) + while True: # could get multiple completed triggers at this yield conds = yield self.env.any_of(events.values()) for event in conds.values(): # print(f"{self.env.now}: R{self.rid} {event}") - if event.enum == EType.ApiBatch: - batch = event.value - self.protocol.handle_api_batch(batch) - events["api_batch"] = self.env.process(self.extl_api.batch()) + if event.enum == EType.ApiGotReq: + req = event.value + yield self.env.process(self.protocol.handle_api_got_req(req)) + events["api_got_req"] = self.env.process(self.extl_api.req()) elif event.enum == EType.DiskSaved: mark = event.value - self.protocol.handle_disk_saved(mark) + yield self.env.process(self.protocol.handle_disk_saved(mark)) events["disk_saved"] = self.env.process(self.disk_dev.saved()) elif event.enum == EType.NetRecved: peer, msg = event.info, event.value - self.protocol.handle_net_recved(peer, msg) + yield self.env.process(self.protocol.handle_net_recved(peer, msg)) events[("net_recved", peer)] = self.env.process( self.recv_links[peer].recv() ) @@ -251,7 +259,7 @@ def __init__( self, env, num_replicas, - api_ltvb, + api_ltv, disk_ltv_map, net_ltv_map, protocol, @@ -262,7 +270,7 @@ def __init__( Replica( env, rid, - api_ltvb, + api_ltv, disk_ltv_map[rid], protocol, **protocol_args, @@ -295,37 +303,44 @@ def __init__(self, replica): class MultiPaxos(Protocol): - def __init__(self, replica, quorum_size): + def __init__(self, replica, cluster_size): super().__init__(replica) - self.quorum_size = quorum_size + self.q = cluster_size // 2 + 1 + self.insts = [] + @classmethod + def name_str(cls, cluster_size): + return "MultiPaxos/Raft" + class Instance: def __init__(self): - self.batch = None + self.req = None self.num_replies = 0 self.from_peer = -1 self.client_acked = False class AcceptMsg(Data): - def __init__(self, slot, batch): - super().__init__(f"a-{slot}", batch.size + 8) - self.batch = batch + def __init__(self, slot, req): + super().__init__(f"a-{slot}", req.size + 8) + self.req = req class AcceptReply(Data): def __init__(self, slot): super().__init__(f"r-{slot}", 8) - def handle_api_batch(self, batch): + def handle_api_got_req(self, req): self.insts.append(self.Instance()) slot = len(self.insts) - 1 - self.insts[slot].batch = batch + self.insts[slot].req = req for link in self.replica.send_links.values(): - link.send(self.AcceptMsg(slot, batch)) + link.send(self.AcceptMsg(slot, req)) + + self.replica.disk_dev.write(self.AcceptMsg(slot, req)) - self.replica.disk_dev.write(self.AcceptMsg(slot, batch)) + yield from [] def handle_disk_saved(self, mark): if not mark.startswith("a-"): @@ -339,7 +354,7 @@ def handle_disk_saved(self, mark): if ( not self.insts[slot].client_acked - and self.insts[slot].num_replies >= self.quorum_size + and self.insts[slot].num_replies >= self.q ): 
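+                # the leader's own durable write counts as one of the replies, so
+                # once num_replies reaches q = cluster_size // 2 + 1 the slot is
+                # majority-replicated and the pending request can be acked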
self.ack_client_reqs(slot) @@ -349,6 +364,8 @@ def handle_disk_saved(self, mark): self.AcceptReply(slot) ) + yield from [] + def handle_net_recved(self, peer, msg): if msg.mark.startswith("a-"): # net recv on follower @@ -356,9 +373,9 @@ def handle_net_recved(self, peer, msg): while slot >= len(self.insts): self.insts.append(self.Instance()) self.insts[slot].from_peer = peer - self.insts[slot].batch = msg.batch + self.insts[slot].req = msg.req - self.replica.disk_dev.write(self.AcceptMsg(slot, msg.batch)) + self.replica.disk_dev.write(self.AcceptMsg(slot, msg.req)) elif msg.mark.startswith("r-"): # net recv on leader @@ -368,19 +385,312 @@ def handle_net_recved(self, peer, msg): if ( not self.insts[slot].client_acked - and self.insts[slot].num_replies >= self.quorum_size + and self.insts[slot].num_replies >= self.q ): self.ack_client_reqs(slot) else: raise RuntimeError(f"unrecognized msg mark: {msg.mark}") + yield from [] + def ack_client_reqs(self, slot): assert not self.insts[slot].client_acked + req = self.insts[slot].req + self.replica.extl_api.ack(req.cid, req.mark) + self.insts[slot].client_acked = True + + +class RSPaxos(Protocol): + def __init__( + self, + replica, + cluster_size, + same_liveness, + comp_delay=0, + ): + super().__init__(replica) + + self.cluster_size = cluster_size + self.comp_delay = comp_delay + + self.m = cluster_size // 2 + 1 + if same_liveness: + self.q = cluster_size + self.f = cluster_size - self.m + else: + self.q = math.ceil((cluster_size + self.m) // 2) + self.f = self.q - self.m + + self.insts = [] + + @classmethod + def name_str(cls, cluster_size, same_liveness, comp_delay=0): + if same_liveness: + return f"RS-Paxos/CRaft (f-forced)" + else: + return f"RS-Paxos/CRaft (original)" + + class Instance: + def __init__(self): + self.req = None + self.num_replies = 0 + self.from_peer = -1 + self.client_acked = False + + class AcceptMsg(Data): + def __init__(self, slot, shard): + super().__init__(f"a-{slot}", shard.size + 8) + self.shard = shard + + class AcceptReply(Data): + def __init__(self, slot): + super().__init__(f"r-{slot}", 8) + + def handle_api_got_req(self, req): + self.insts.append(self.Instance()) + slot = len(self.insts) - 1 + self.insts[slot].req = req + + # add EC computation delay + comp_time = self.comp_delay * (float(req.size) / 1000000.0) + yield self.replica.env.timeout(comp_time) + + for peer, link in self.replica.send_links.items(): + codeword = Codeword(req, self.cluster_size, self.m, {peer}) + link.send(self.AcceptMsg(slot, codeword)) + + codeword = Codeword(req, self.cluster_size, self.m, {self.replica.rid}) + self.replica.disk_dev.write(self.AcceptMsg(slot, codeword)) + + yield from [] + + def handle_disk_saved(self, mark): + if not mark.startswith("a-"): + raise RuntimeError(f"unrecognized ent mark: {mark}") + slot = int(mark[2:]) + assert slot < len(self.insts) - for req in self.insts[slot].batch.vec: - self.replica.extl_api.ack(req.cid, req.mark) + if self.insts[slot].from_peer < 0: + # disk save on leader + self.insts[slot].num_replies += 1 + + if ( + not self.insts[slot].client_acked + and self.insts[slot].num_replies >= self.q + ): + self.ack_client_reqs(slot) + + else: + # disk save on follower + self.replica.send_links[self.insts[slot].from_peer].send( + self.AcceptReply(slot) + ) + yield from [] + + def handle_net_recved(self, peer, msg): + if msg.mark.startswith("a-"): + # net recv on follower + slot = int(msg.mark[2:]) + while slot >= len(self.insts): + self.insts.append(self.Instance()) + self.insts[slot].from_peer = peer 
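+            # the follower persists only its own shard of the codeword (roughly
+            # req.size / m bytes plus the 8-byte header), never the full request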
+ self.insts[slot].req = msg.shard + + self.replica.disk_dev.write(self.AcceptMsg(slot, msg.shard)) + + elif msg.mark.startswith("r-"): + # net recv on leader + slot = int(msg.mark[2:]) + assert slot < len(self.insts) + self.insts[slot].num_replies += 1 + + if ( + not self.insts[slot].client_acked + and self.insts[slot].num_replies >= self.q + ): + self.ack_client_reqs(slot) + + else: + raise RuntimeError(f"unrecognized msg mark: {msg.mark}") + + yield from [] + + def ack_client_reqs(self, slot): + assert not self.insts[slot].client_acked + req = self.insts[slot].req + self.replica.extl_api.ack(req.cid, req.mark) + self.insts[slot].client_acked = True + + +class Crossword(Protocol): + def __init__( + self, + replica, + cluster_size, + comp_delay=0, + shards_per_replica=1, # NOTE: a "cheating" approach to adaptiveness + ): + super().__init__(replica) + + self.cluster_size = cluster_size + self.comp_delay = comp_delay + + self.m = cluster_size // 2 + 1 + f = cluster_size - self.m + assert shards_per_replica >= 1 + assert shards_per_replica <= self.m + self.l = shards_per_replica + self.q = self.m + f + 1 - self.l + + self.insts = [] + + @classmethod + def name_str(cls, cluster_size, comp_delay=0, shards_per_replica=1): + return f"Crossword" + + # def update_perf_number(self, lat): + # self.perf_tries[self.l].append(lat) + # if len(self.perf_tries[self.l]) > 100: + # del self.perf_tries[self.l][0] + + # if not self.all_tried: + # if len(self.perf_tries[self.l]) >= 100: + # if self.l == 1: + # self.all_tried = True + # else: + # self.l -= 1 + + # def choose_best_config(self): + # if self.all_tried and not self.ql_picked: + # m = self.cluster_size // 2 + 1 + # f = self.cluster_size - m + + # avg_lats = dict() + # for l, lats in self.perf_tries.items(): + # sorted_lats = sorted(lats)[:-10] + # avg_lats[l] = sum(sorted_lats) / len(sorted_lats) + # self.l = min(avg_lats, key=avg_lats.get) + # self.q = m + f - self.l + 1 + + # print(" picked", self.l) + # self.ql_picked = True + + class Instance: + def __init__(self): + self.req = None + self.num_replies = 0 + self.from_peer = -1 + self.client_acked = False + + class AcceptMsg(Data): + def __init__(self, slot, shard): + super().__init__(f"a-{slot}", shard.size + 8) + self.shard = shard + + class AcceptReply(Data): + def __init__(self, slot): + super().__init__(f"r-{slot}", 8) + + def handle_api_got_req(self, req): + self.insts.append(self.Instance()) + slot = len(self.insts) - 1 + self.insts[slot].req = req + + # add EC computation delay + comp_time = self.comp_delay * (float(req.size) / 1000000.0) + yield self.replica.env.timeout(comp_time) + + # pick the best config if haven't yet + # self.choose_best_config() + + # record this req's starting time + # self.curr_reqs[req.mark] = self.replica.env.now + + for peer, link in self.replica.send_links.items(): + codeword = Codeword( + req, + self.cluster_size, + self.m, + {(p % self.cluster_size) for p in range(peer, peer + self.l)}, + ) + link.send(self.AcceptMsg(slot, codeword)) + + me = self.replica.rid + codeword = Codeword( + req, + self.cluster_size, + self.m, + {(p % self.cluster_size) for p in range(me, me + self.l)}, + ) + self.replica.disk_dev.write(self.AcceptMsg(slot, codeword)) + + yield from [] + + def handle_disk_saved(self, mark): + if not mark.startswith("a-"): + raise RuntimeError(f"unrecognized ent mark: {mark}") + slot = int(mark[2:]) + assert slot < len(self.insts) + + if self.insts[slot].from_peer < 0: + # disk save on leader + self.insts[slot].num_replies += 1 + + if ( + not 
self.insts[slot].client_acked + and self.insts[slot].num_replies >= self.q + ): + self.ack_client_reqs(slot) + + else: + # disk save on follower + self.replica.send_links[self.insts[slot].from_peer].send( + self.AcceptReply(slot) + ) + + yield from [] + + def handle_net_recved(self, peer, msg): + if msg.mark.startswith("a-"): + # net recv on follower + slot = int(msg.mark[2:]) + while slot >= len(self.insts): + self.insts.append(self.Instance()) + self.insts[slot].from_peer = peer + self.insts[slot].req = msg.shard + + self.replica.disk_dev.write(self.AcceptMsg(slot, msg.shard)) + + elif msg.mark.startswith("r-"): + # net recv on leader + slot = int(msg.mark[2:]) + assert slot < len(self.insts) + self.insts[slot].num_replies += 1 + + if ( + not self.insts[slot].client_acked + and self.insts[slot].num_replies >= self.q + ): + self.ack_client_reqs(slot) + + else: + raise RuntimeError(f"unrecognized msg mark: {msg.mark}") + + yield from [] + + def ack_client_reqs(self, slot): + assert not self.insts[slot].client_acked + req = self.insts[slot].req + + # update perf records + # assert req.mark in self.curr_reqs + # lat = self.replica.env.now - self.curr_reqs[req.mark] + # self.update_perf_number(lat) + # del self.curr_reqs[req.mark] + + self.replica.extl_api.ack(req.cid, req.mark) self.insts[slot].client_acked = True @@ -408,11 +718,19 @@ def add_ack(self, mark): self.total_acks += 1 self.ack_times[mark] = self.env.now - def display(self, chunk_time): + def summary(self): lats = [self.ack_times[m] - self.req_times[m] for m in self.ack_times] - avg_tput = len(lats) / chunk_time + lats.sort() + assert len(lats) > 100 + + chunk_cnt = len(lats) + med_lat = lats[len(lats) // 2] + + lats = lats[:-100] avg_lat = sum(lats) / len(lats) if len(lats) > 0 else 0.0 - return f"{avg_tput:>9.2f} {avg_lat:>9.2f} {len(lats):>7d} {self.total_acks:>8d} / {self.total_sent:<8d}" + std_lat = statistics.stdev(lats) + + return (med_lat, avg_lat, std_lat, chunk_cnt, self.total_acks, self.total_sent) def clear(self): for mark in self.ack_times: @@ -421,7 +739,7 @@ def clear(self): class Client: - def __init__(self, env, cluster, cid, freq, vsize, chunk_time): + def __init__(self, env, cluster, cid, freq, vsize): self.env = env self.cid = cid self.service = cluster @@ -429,14 +747,11 @@ def __init__(self, env, cluster, cid, freq, vsize, chunk_time): self.gap = 1.0 / freq self.vsize = vsize + self.stats = Stats(env) self.mark = 0 self.tick = simpy.Container(env, capacity=1) - self.stats = Stats(env) - self.last_print = 0 - self.chunk_time = chunk_time - self.env.process(self.ticker()) def ticker(self): @@ -450,15 +765,12 @@ def new_req(self): self.mark += 1 return SendNewReq(self.mark) - def loop(self): + def loop(self, num_reqs=None): events = { "req": self.env.process(self.new_req()), "ack": self.env.process(self.ack_link.recv()), } - print( - f"{'Time':>5s}: {'Tput':>9s} {'Lat':>9s} {'Chunk':>7s} {'Reply':>8s} / {'Total':<8s}" - ) while True: # could get multiple completed triggers at this yield conds = yield self.env.any_of(events.values()) @@ -469,7 +781,11 @@ def loop(self): mark = event.value self.req_link.send(Req(self.cid, mark, self.vsize)) self.stats.add_req(mark) - events["req"] = self.env.process(self.new_req()) + # if num_reqs given, only issue this many reqs + if num_reqs is None or self.stats.total_sent < num_reqs: + events["req"] = self.env.process(self.new_req()) + else: + del events["req"] elif event.enum == EType.NetRecved: mark = event.value.mark @@ -479,14 +795,14 @@ def loop(self): else: raise 
RuntimeError(f"unrecognized event type: {event}") - # print chunk-average stats - if self.env.now - self.last_print > self.chunk_time: - print(f"{self.env.now:>5.1f}: {self.stats.display(self.chunk_time)}") - self.stats.clear() - self.last_print = self.env.now + # if num_reqs given, only issue this many reqs + if num_reqs is not None and self.stats.total_acks == num_reqs: + break + + return self.stats - def start(self): - self.env.process(self.loop()) + def start(self, num_reqs=None): + return self.env.process(self.loop(num_reqs=num_reqs)) ################# @@ -494,31 +810,125 @@ def start(self): ################# +class HomoParams: + def __init__(self, num_replicas, api_ltv, disk_ltv, net_ltv, vsize): + self.num_replicas = num_replicas + + self.api_ltv = api_ltv + self.disk_ltv_map = {rid: disk_ltv for rid in range(num_replicas)} + self.net_ltv_map = dict() + for rid in range(num_replicas): + for peerid in range(rid + 1, num_replicas): + self.net_ltv_map[(rid, peerid)] = net_ltv + + # NOTE: a "cheating" approach to adaptiveness + shards_per_replica = 1 + if net_ltv[0] >= 10: + shards_per_replica = 3 + elif net_ltv[0] >= 5: + if vsize <= 1900 * 1000: + shards_per_replica = 3 + elif vsize <= 2300 * 1000: + shards_per_replica = 2 + + self.protocol_configs = [ + (MultiPaxos, {"cluster_size": num_replicas}), + (RSPaxos, {"cluster_size": num_replicas, "same_liveness": True}), + (RSPaxos, {"cluster_size": num_replicas, "same_liveness": False}), + ( + Crossword, + { + "cluster_size": num_replicas, + "shards_per_replica": shards_per_replica, + }, + ), + ] + + self.vsize = vsize + + +class ParamsLatBounded(HomoParams): + def __init__(self, num_replicas, vsize): + api_ltv = (1, 1, 1, 1) + disk_ltv = (2, 0.5, 20, 1.5) + net_ltv = (10, 2.5, 20, 1.5) + super().__init__(num_replicas, api_ltv, disk_ltv, net_ltv, vsize) + + +class ParamsTputBounded(HomoParams): + def __init__(self, num_replicas, vsize): + api_ltv = (1, 1, 1, 1) + disk_ltv = (0.1, 10, 20, 1.5) + net_ltv = (0.5, 50, 20, 1.5) + super().__init__(num_replicas, api_ltv, disk_ltv, net_ltv, vsize) + + +class ParamsLatTputMix(HomoParams): + def __init__(self, num_replicas, vsize): + api_ltv = (1, 1, 1, 1) + disk_ltv = (1, 5, 20, 1.5) + net_ltv = (5, 25, 20, 1.5) + super().__init__(num_replicas, api_ltv, disk_ltv, net_ltv, vsize) + + +def simulate(params): + results = dict() + for protocol, protocol_args in params.protocol_configs: + env = simpy.Environment() + cluster = Cluster( + env, + params.num_replicas, + params.api_ltv, + params.disk_ltv_map, + params.net_ltv_map, + protocol, + **protocol_args, + ) + client = Client(env, cluster, 2957, freq=0.002, vsize=params.vsize) + + cluster.launch() + done = client.start(num_reqs=1000) + stats = env.run(until=done) + + med_lat, avg_lat, std_lat, _, _, _ = stats.summary() + name_str = protocol.name_str(**protocol_args) + results[name_str] = (med_lat, avg_lat, std_lat) + + return results + + if __name__ == "__main__": - num_replicas = 5 - api_ltvb = (1, 1, 0, 3) - disk_ltv_map = {rid: (1, 1, 0) for rid in range(num_replicas)} - net_ltv_map = dict() - for rid in range(num_replicas): - for peerid in range(rid + 1, num_replicas): - net_ltv_map[(rid, peerid)] = (1, 1, 0) - freq = 1 - vsize = 1 - - env = simpy.Environment() - - cluster = Cluster( - env, - 5, - api_ltvb, - disk_ltv_map, - net_ltv_map, - MultiPaxos, - quorum_size=3, - ) - cluster.launch() + random.seed() + + # TODO: real adaptiveness design + print("NOTE: adaptiveness hardcoded for 5!") + + # for num_replicas in (3, 5, 7, 9): + for num_replicas 
in (5,): + results = { + "vsizes": [], + "lat_bounded": [], + "tput_bounded": [], + "lat_tput_mix": [], + } + + vsizes = [v * 1000 for v in (2**p for p in range(3, 11))] + vsizes += [v * 1000 for v in (100 * i for i in range(1, 51))] + vsizes.sort() - client = Client(env, cluster, 2957, freq, vsize, 10) - client.start() + for vsize in vsizes: + results["vsizes"].append(vsize) + results["lat_bounded"].append( + simulate(ParamsLatBounded(num_replicas, vsize)) + ) + results["tput_bounded"].append( + simulate(ParamsTputBounded(num_replicas, vsize)) + ) + results["lat_tput_mix"].append( + simulate(ParamsLatTputMix(num_replicas, vsize)) + ) + print(f"Ran: {num_replicas} {vsize // 1000}") - env.run(until=60) + with open(f"results/sim.x_vsize.r_{num_replicas}.pkl", "wb") as fpkl: + pickle.dump(results, fpkl) + print(f"Dumped: {num_replicas}") diff --git a/models/plot_sim_results.py b/models/plot_sim_results.py new file mode 100644 index 00000000..18e6c6f1 --- /dev/null +++ b/models/plot_sim_results.py @@ -0,0 +1,105 @@ +import matplotlib + +matplotlib.use("Agg") + +import pickle +import math +import matplotlib.pyplot as plt + + +def protocol_style(protocol, cluster_size): + m = cluster_size // 2 + 1 + f = cluster_size - m + if "MultiPaxos" in protocol: + return ("-", "dimgray", "s", f"MultiPaxos/Raft\nf={f} |Q|={m} l={m}") + elif "RS-Paxos" in protocol: + if "forced" in protocol: + return ( + "-", + "red", + "x", + f"RS-Paxos/CRaft (f-forced)\nf={f} |Q|={cluster_size} l=1", + ) + else: + q = math.ceil((cluster_size + m) // 2) + lower_f = q - m + return ( + ":", + "orange", + "x", + f"RS-Paxos/CRaft (original)\nf={lower_f} |Q|={q} l=1", + ) + elif "Crossword" in protocol: + return ("-", "steelblue", "o", f"Crossword\nf={f} |Q|,l=adaptive") + else: + raise RuntimeError(f"unrecognized protocol {protocol}") + + +def params_display(params): + if params == "lat_bounded": + return "Latency bounded" + elif params == "tput_bounded": + return "Throughput bounded" + elif params == "lat_tput_mix": + return "Both moderate" + else: + raise RuntimeError(f"unrecognized params {params}") + + +def plot_x_vsize(num_replicas, results): + matplotlib.rcParams.update( + { + "figure.figsize": (11, 3), + "font.size": 10, + } + ) + + plt.figure() + + xs = list(map(lambda s: s / 1000, results["vsizes"])) + protocols = results["lat_bounded"][0].keys() + + for idx, params in enumerate(("lat_bounded", "lat_tput_mix", "tput_bounded")): + plt.subplot(131 + idx) + + for protocol in protocols: + ys = [r[protocol][0] for r in results[params]] + yerrs = [r[protocol][2] for r in results[params]] + linestyle, color, marker, label = protocol_style(protocol, num_replicas) + + plt.errorbar( + xs, + ys, + # yerr=yerrs, + label=label, + linestyle=linestyle, + linewidth=2, + color=color, + # marker=marker, + # markersize=3, + ecolor="darkgray", + elinewidth=1, + capsize=2, + ) + + plt.ylim(0, 420) + + plt.xlabel("Instance size (kB)") + plt.ylabel("Response time (ms)") + + title = params_display(params) + plt.title(title) + + plt.legend(loc="center left", bbox_to_anchor=(1.1, 0.5), labelspacing=1.2) + + plt.tight_layout() + + plt.savefig(f"results/sim.x_vsize.r_{num_replicas}.png", dpi=300) + plt.close() + + +if __name__ == "__main__": + for num_replicas in (5,): + with open(f"results/sim.x_vsize.r_{num_replicas}.pkl", "rb") as fpkl: + results = pickle.load(fpkl) + plot_x_vsize(num_replicas, results) From 9e7ad6be2fd5bd7733f62f26ad88c5aa083c44e4 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Mon, 2 Oct 2023 17:06:18 -0500 Subject: [PATCH 
72/89] polish constraint boundary figure --- models/plot_cstr_bounds.py | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/models/plot_cstr_bounds.py b/models/plot_cstr_bounds.py index a9c49cfc..30a6335c 100644 --- a/models/plot_cstr_bounds.py +++ b/models/plot_cstr_bounds.py @@ -70,6 +70,14 @@ def plot_cstr_bound(idx, cluster_size): ys = [m, m + 1, 2, 1] plt.fill(xs, ys, color=fill_color, label="Region of fault-tolerance=f", zorder=0) + # unused x-axis range + if cluster_size < CLUSTER_SIZES[-1]: + xs = [n + 0.9, X_TICKS[-1] + 0.35, X_TICKS[-1] + 0.35, n + 0.8] + ys = [0.3, 0.3, 0, 0] + plt.fill( + xs, ys, hatch="///", fill=False, edgecolor=None, linewidth=0, zorder=10 + ) + # latency & throughput optimized arrows plt.arrow( m + 0.3, @@ -117,22 +125,27 @@ def plot_cstr_bound(idx, cluster_size): plt.xlim((0, X_TICKS[-1] + 0.7)) plt.ylim((0, Y_TICKS[-1] + 2.7)) - plt.xticks(X_TICKS, list(map(str, X_TICKS))) + plt.xticks(X_TICKS[:cluster_size], list(map(str, X_TICKS))[:cluster_size]) plt.yticks(Y_TICKS, list(map(str, Y_TICKS))) plt.xlabel("|Quorum|", loc="right") plt.ylabel("#Shards\n/replica", loc="top", rotation=0, backgroundcolor="white") - ax.xaxis.set_label_coords(1.05, -0.18) + if idx < 2: + ax.xaxis.set_label_coords(1.05, -0.1) + else: + ax.xaxis.set_label_coords(1.05, -0.18) ax.yaxis.set_label_coords(0.2, 0.8) - plt.title( - f"|Cluster|={n} f={f}", - x=0.3, - y=-0.38, - fontsize=10, - fontweight="bold", - backgroundcolor=fill_color, - ) + # plt.title( + # f"|Cluster|={n} f={f}", + # x=0.5, + # y=-0.48, + # fontsize=11, + # # fontweight="bold", + # # backgroundcolor=fill_color, + # ) + plt.text(2.2, -3.2, f"|Cluster|={n} f={f}", fontsize=11) + plt.text(1, -3.2, "▬", fontsize=11, color=line_color) return ax @@ -182,7 +195,7 @@ def make_legend_polygon( sorted_handles, sorted_labels, loc="lower center", - bbox_to_anchor=(0.5, 0.78), + bbox_to_anchor=(0.5, 0.81), ncol=len(handles), handlelength=1.5, handletextpad=0.5, @@ -214,7 +227,7 @@ def plot_all_cstr_bounds(): # single legend group on top make_legend(fig, handles, labels) - plt.tight_layout() + plt.tight_layout(pad=1.0) plt.savefig(f"results/cstr_bounds.png", dpi=300) From c48cc82a5b676db4301c23e805b0336003ad2208 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Tue, 3 Oct 2023 18:06:47 -0500 Subject: [PATCH 73/89] fixing commit_bar and exec_bar bugs --- src/manager/clusman.rs | 18 +++--- src/protocols/crossword.rs | 107 +++++++++++++++++++++-------------- src/protocols/multipaxos.rs | 60 ++++++++++++-------- src/protocols/rep_nothing.rs | 2 + src/protocols/rs_paxos.rs | 91 +++++++++++++++-------------- src/protocols/simple_push.rs | 2 + 6 files changed, 164 insertions(+), 116 deletions(-) diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index a21ef9c7..89f2700d 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -186,19 +186,12 @@ impl ClusterManager { protocol); } - // tell it to connect to all existing known servers + // gather the list of all existing known servers let to_peers: HashMap = self .server_info .iter() .map(|(&server, info)| (server, info.p2p_addr)) .collect(); - self.server_reigner.send_ctrl( - CtrlMsg::ConnectToPeers { - population: self.population, - to_peers, - }, - server, - )?; // save new server's info self.server_info.insert( @@ -211,6 +204,15 @@ impl ClusterManager { start_slot: 0, }, ); + + // tell it to connect to all other existing known servers + self.server_reigner.send_ctrl( + CtrlMsg::ConnectToPeers { + population: 
self.population, + to_peers, + }, + server, + )?; Ok(()) } diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 95f044c1..11a76c6a 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -330,6 +330,7 @@ pub struct CrosswordReplica { // CrosswordReplica common helpers impl CrosswordReplica { /// Create an empty null instance. + #[inline] fn null_instance(&self) -> Result { Ok(Instance { bal: 0, @@ -352,18 +353,32 @@ impl CrosswordReplica { }) } + /// Locate the first null slot or append a null instance if no holes exist. + fn first_null_slot(&mut self) -> Result { + for s in self.commit_bar..(self.start_slot + self.insts.len()) { + if self.insts[s - self.start_slot].status == Status::Null { + return Ok(s); + } + } + self.insts.push(self.null_instance()?); + Ok(self.start_slot + self.insts.len() - 1) + } + /// Compose a unique ballot number from base. + #[inline] fn make_unique_ballot(&self, base: u64) -> Ballot { ((base << 8) | ((self.id + 1) as u64)) as Ballot } /// Compose a unique ballot number greater than the given one. + #[inline] fn make_greater_ballot(&self, bal: Ballot) -> Ballot { self.make_unique_ballot((bal >> 8) + 1) } /// Compose LogActionId from slot index & entry type. /// Uses the `Status` enum type to represent differnet entry types. + #[inline] fn make_log_action_id(slot: usize, entry_type: Status) -> LogActionId { let type_num = match entry_type { Status::Preparing => 1, @@ -375,6 +390,7 @@ impl CrosswordReplica { } /// Decompose LogActionId into slot index & entry type. + #[inline] fn split_log_action_id(log_action_id: LogActionId) -> (usize, Status) { let slot = (log_action_id >> 2) as usize; let type_num = log_action_id & ((1 << 2) - 1); @@ -388,6 +404,7 @@ impl CrosswordReplica { } /// Compose CommandId from slot index & command index within. + #[inline] fn make_command_id(slot: usize, cmd_idx: usize) -> CommandId { assert!(slot <= (u32::MAX as usize)); assert!(cmd_idx <= (u32::MAX as usize)); @@ -395,6 +412,7 @@ impl CrosswordReplica { } /// Decompose CommandId into slot index & command index within. + #[inline] fn split_command_id(command_id: CommandId) -> (usize, usize) { let slot = (command_id >> 32) as usize; let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; @@ -402,12 +420,17 @@ impl CrosswordReplica { } /// TODO: maybe remove this. + #[inline] fn shards_for_replica( + slot: usize, id: ReplicaId, population: u8, num_shards: u8, ) -> Vec { - (id..(id + num_shards)).map(|i| (i % population)).collect() + let first: u8 = ((id as usize + slot) % population as usize) as u8; + (first..(first + num_shards)) + .map(|i| (i % population)) + .collect() } /// TODO: make better impl of this. 
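
For reference, a minimal Python sketch of the rotated shard assignment that the
updated `shards_for_replica` helper above computes (helper and variable names here
are illustrative only):

    def shards_for_replica(slot, rid, population, num_shards):
        # the starting shard rotates with the slot index, so shard ownership
        # spreads evenly across replicas over consecutive slots
        first = (rid + slot) % population
        return [(first + i) % population for i in range(num_shards)]

    # e.g. with population=5 and num_shards=3, replica 0 is assigned shards
    # [0, 1, 2] at slot 0 and [1, 2, 3] at slot 1
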
@@ -495,33 +518,18 @@ impl CrosswordReplica { reqs_cw.compute_parity(Some(&self.rs_coder))?; // create a new instance in the first null slot (or append a new one - // at the end if no holes exist) - let mut slot = self.start_slot + self.insts.len(); - for s in self.commit_bar..(self.start_slot + self.insts.len()) { - if self.insts[s - self.start_slot].status == Status::Null { - slot = s; - break; - } - } - if slot < self.start_slot + self.insts.len() { - let old_inst = &mut self.insts[slot - self.start_slot]; - assert_eq!(old_inst.status, Status::Null); - old_inst.reqs_cw = reqs_cw; - old_inst.leader_bk = Some(LeaderBookkeeping { - prepare_acks: Bitmap::new(self.population, false), - prepare_max_bal: 0, - accept_acks: HashMap::new(), - }); - } else { - let mut new_inst = self.null_instance()?; - new_inst.reqs_cw = reqs_cw; - new_inst.leader_bk = Some(LeaderBookkeeping { + // at the end if no holes exist); fill it up with incoming data + let slot = self.first_null_slot()?; + { + let inst = &mut self.insts[slot - self.start_slot]; + assert_eq!(inst.status, Status::Null); + inst.reqs_cw = reqs_cw; + inst.leader_bk = Some(LeaderBookkeeping { prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, accept_acks: HashMap::new(), }); - new_inst.external = true; - self.insts.push(new_inst); + inst.external = true; } // decide whether we can enter fast path for this instance @@ -578,6 +586,7 @@ impl CrosswordReplica { Bitmap::from( self.population, Self::shards_for_replica( + slot, self.id, self.population, self.config.shards_per_replica, @@ -615,6 +624,7 @@ impl CrosswordReplica { Bitmap::from( self.population, Self::shards_for_replica( + slot, peer, self.population, self.config.shards_per_replica, @@ -732,6 +742,8 @@ impl CrosswordReplica { if inst.status < Status::Committed { break; } + let now_slot = self.commit_bar; + self.commit_bar += 1; if inst.reqs_cw.avail_shards() < self.quorum_cnt { // can't execute if I don't have the complete request batch @@ -752,7 +764,7 @@ impl CrosswordReplica { for (cmd_idx, (_, req)) in reqs.iter().enumerate() { if let ApiRequest::Req { cmd, .. 
} = req { self.state_machine.submit_cmd( - Self::make_command_id(self.commit_bar, cmd_idx), + Self::make_command_id(now_slot, cmd_idx), cmd.clone(), )?; } else { @@ -760,10 +772,8 @@ impl CrosswordReplica { } } pf_trace!(self.id; "submitted {} exec commands for slot {}", - reqs.len(), self.commit_bar); + reqs.len(), now_slot); } - - self.commit_bar += 1; } } @@ -934,6 +944,7 @@ impl CrosswordReplica { Bitmap::from( self.population, Self::shards_for_replica( + slot, self.id, self.population, self.config.shards_per_replica, @@ -969,6 +980,7 @@ impl CrosswordReplica { Bitmap::from( self.population, Self::shards_for_replica( + slot, peer, self.population, self.config.shards_per_replica, @@ -1078,6 +1090,7 @@ impl CrosswordReplica { Bitmap::from( self.population, Self::shards_for_replica( + slot, peer, self.population, self.config.shards_per_replica, @@ -1220,7 +1233,6 @@ impl CrosswordReplica { peer, slot, ballot, reqs_cw.avail_shards_map()); assert!(slot < self.start_slot + self.insts.len()); assert!(self.insts[slot - self.start_slot].status >= Status::Committed); - let num_insts = self.start_slot + self.insts.len(); let inst = &mut self.insts[slot - self.start_slot]; // if reply not outdated and ballot is up-to-date @@ -1229,10 +1241,10 @@ impl CrosswordReplica { inst.reqs_cw.absorb_other(reqs_cw)?; // if enough shards have been gathered, can push execution forward - if slot == self.commit_bar { - while self.commit_bar < num_insts { - let inst = - &mut self.insts[self.commit_bar - self.start_slot]; + if slot == self.exec_bar { + let mut now_slot = self.exec_bar; + while now_slot < self.start_slot + self.insts.len() { + let inst = &mut self.insts[now_slot - self.start_slot]; if inst.status < Status::Committed || inst.reqs_cw.avail_shards() < self.quorum_cnt { @@ -1253,10 +1265,7 @@ impl CrosswordReplica { for (cmd_idx, (_, req)) in reqs.iter().enumerate() { if let ApiRequest::Req { cmd, .. } = req { self.state_machine.submit_cmd( - Self::make_command_id( - self.commit_bar, - cmd_idx, - ), + Self::make_command_id(now_slot, cmd_idx), cmd.clone(), )?; } else { @@ -1264,10 +1273,10 @@ impl CrosswordReplica { } } pf_trace!(self.id; "submitted {} exec commands for slot {}", - reqs.len(), self.commit_bar); + reqs.len(), now_slot); } - self.commit_bar += 1; + now_slot += 1; } } } @@ -1777,7 +1786,7 @@ impl CrosswordReplica { LogEntry::CommitSlot { slot } => { assert!(slot < self.start_slot + self.insts.len()); - // update instance state + // update instance status self.insts[slot - self.start_slot].status = Status::Committed; // submit commands in contiguously committed instance to the // state machine @@ -1788,6 +1797,8 @@ impl CrosswordReplica { if inst.status < Status::Committed { break; } + // update commit_bar + self.commit_bar += 1; // check number of available shards if inst.reqs_cw.avail_shards() < self.quorum_cnt { // can't execute if I don't have the complete request batch @@ -1808,9 +1819,9 @@ impl CrosswordReplica { let _ = self.state_machine.get_result().await?; } } - // update commit_bar and exec_bar - self.commit_bar += 1; + // update instance status and exec_bar self.exec_bar += 1; + inst.status = Status::Executed; } } } @@ -1863,6 +1874,10 @@ impl CrosswordReplica { offset_ok: true, .. 
} = log_result { + if self.log_offset > 0 { + pf_info!(self.id; "recovered from wal log: commit {} exec {}", + self.commit_bar, self.exec_bar); + } Ok(()) } else { logged_err!(self.id; "unexpected log result type or failed truncate") @@ -1964,12 +1979,12 @@ impl CrosswordReplica { /// NOTE: the current implementation does not guard against crashes in the /// middle of taking a snapshot. async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { + pf_debug!(self.id; "taking new snapshot: start {} exec {}", + self.start_slot, self.exec_bar); assert!(self.exec_bar >= self.start_slot); if self.exec_bar == self.start_slot { return Ok(()); } - pf_debug!(self.id; "taking new snapshot: start {} exec {}", - self.start_slot, self.exec_bar); // collect and dump all Puts in executed instances if self.is_leader { @@ -2055,6 +2070,10 @@ impl CrosswordReplica { self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { new_start: self.start_slot, })?; + + if self.start_slot > 0 { + pf_info!(self.id; "recovered from snapshot: start {}", self.start_slot); + } Ok(()) } diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 4783f4f7..85f01231 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -298,6 +298,7 @@ pub struct MultiPaxosReplica { // MultiPaxosReplica common helpers impl MultiPaxosReplica { /// Create an empty null instance. + #[inline] fn null_instance(&self) -> Instance { Instance { bal: 0, @@ -311,18 +312,32 @@ impl MultiPaxosReplica { } } + /// Locate the first null slot or append a null instance if no holes exist. + fn first_null_slot(&mut self) -> usize { + for s in self.commit_bar..(self.start_slot + self.insts.len()) { + if self.insts[s - self.start_slot].status == Status::Null { + return s; + } + } + self.insts.push(self.null_instance()); + self.start_slot + self.insts.len() - 1 + } + /// Compose a unique ballot number from base. + #[inline] fn make_unique_ballot(&self, base: u64) -> Ballot { ((base << 8) | ((self.id + 1) as u64)) as Ballot } /// Compose a unique ballot number greater than the given one. + #[inline] fn make_greater_ballot(&self, bal: Ballot) -> Ballot { self.make_unique_ballot((bal >> 8) + 1) } /// Compose LogActionId from slot index & entry type. /// Uses the `Status` enum type to represent differnet entry types. + #[inline] fn make_log_action_id(slot: usize, entry_type: Status) -> LogActionId { let type_num = match entry_type { Status::Preparing => 1, @@ -334,6 +349,7 @@ impl MultiPaxosReplica { } /// Decompose LogActionId into slot index & entry type. + #[inline] fn split_log_action_id(log_action_id: LogActionId) -> (usize, Status) { let slot = (log_action_id >> 2) as usize; let type_num = log_action_id & ((1 << 2) - 1); @@ -347,6 +363,7 @@ impl MultiPaxosReplica { } /// Compose CommandId from slot index & command index within. + #[inline] fn make_command_id(slot: usize, cmd_idx: usize) -> CommandId { assert!(slot <= (u32::MAX as usize)); assert!(cmd_idx <= (u32::MAX as usize)); @@ -354,6 +371,7 @@ impl MultiPaxosReplica { } /// Decompose CommandId into slot index & command index within. 
+ #[inline] fn split_command_id(command_id: CommandId) -> (usize, usize) { let slot = (command_id >> 32) as usize; let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; @@ -394,31 +412,18 @@ impl MultiPaxosReplica { } // create a new instance in the first null slot (or append a new one - // at the end if no holes exist) - let mut slot = self.start_slot + self.insts.len(); - for s in self.commit_bar..(self.start_slot + self.insts.len()) { - let old_inst = &mut self.insts[s - self.start_slot]; - if old_inst.status == Status::Null { - old_inst.reqs = req_batch.clone(); - old_inst.leader_bk = Some(LeaderBookkeeping { - prepare_acks: Bitmap::new(self.population, false), - prepare_max_bal: 0, - accept_acks: Bitmap::new(self.population, false), - }); - slot = s; - break; - } - } - if slot == self.start_slot + self.insts.len() { - let mut new_inst = self.null_instance(); - new_inst.reqs = req_batch.clone(); - new_inst.leader_bk = Some(LeaderBookkeeping { + // at the end if no holes exist); fill it up with incoming data + let slot = self.first_null_slot(); + { + let inst = &mut self.insts[slot - self.start_slot]; + assert_eq!(inst.status, Status::Null); + inst.reqs = req_batch.clone(); + inst.leader_bk = Some(LeaderBookkeeping { prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, accept_acks: Bitmap::new(self.population, false), }); - new_inst.external = true; - self.insts.push(new_inst); + inst.external = true; } // decide whether we can enter fast path for this instance @@ -1338,7 +1343,7 @@ impl MultiPaxosReplica { LogEntry::CommitSlot { slot } => { assert!(slot < self.start_slot + self.insts.len()); - // update instance state + // update instance status self.insts[slot - self.start_slot].status = Status::Committed; // submit commands in contiguously committed instance to the // state machine @@ -1358,9 +1363,10 @@ impl MultiPaxosReplica { let _ = self.state_machine.get_result().await?; } } - // update commit_bar and exec_bar + // update instance status, commit_bar and exec_bar self.commit_bar += 1; self.exec_bar += 1; + inst.status = Status::Executed; } } } @@ -1413,6 +1419,10 @@ impl MultiPaxosReplica { offset_ok: true, .. } = log_result { + if self.log_offset > 0 { + pf_info!(self.id; "recovered from wal log: commit {} exec {}", + self.commit_bar, self.exec_bar); + } Ok(()) } else { logged_err!(self.id; "unexpected log result type or failed truncate") @@ -1604,6 +1614,10 @@ impl MultiPaxosReplica { self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { new_start: self.start_slot, })?; + + if self.start_slot > 0 { + pf_info!(self.id; "recovered from snapshot: start {}", self.start_slot); + } Ok(()) } diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index 74ea31a7..af46cc69 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -109,6 +109,7 @@ pub struct RepNothingReplica { // RepNothingReplica common helpers impl RepNothingReplica { /// Compose CommandId from instance index & command index within. + #[inline] fn make_command_id(inst_idx: usize, cmd_idx: usize) -> CommandId { assert!(inst_idx <= (u32::MAX as usize)); assert!(cmd_idx <= (u32::MAX as usize)); @@ -116,6 +117,7 @@ impl RepNothingReplica { } /// Decompose CommandId into instance index & command index within. 
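// ---------------------------------------------------------------------------
// Aside: a minimal, self-contained sketch (not part of the patches above) of the
// bit-packing that the make_*/split_* helpers in these hunks perform. A Ballot
// keeps the proposer's replica ID (+1) in its low 8 bits, and a CommandId keeps
// the slot index in the high 32 bits with the command's index within the batch
// in the low 32 bits.
fn packing_example() {
    let id: u64 = 2; // this replica's ID
    let ballot = (5u64 << 8) | (id + 1); // make_unique_ballot(5)      -> 1283
    let greater = (((ballot >> 8) + 1) << 8) | (id + 1); // make_greater_ballot -> 1539
    assert_eq!((ballot, greater), (1283, 1539));

    let command_id = (7u64 << 32) | 4; // make_command_id(slot 7, cmd_idx 4)
    assert_eq!(command_id >> 32, 7); // split_command_id: slot
    assert_eq!(command_id & ((1 << 32) - 1), 4); // split_command_id: cmd_idx
}
// ---------------------------------------------------------------------------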
+ #[inline] fn split_command_id(command_id: CommandId) -> (usize, usize) { let inst_idx = (command_id >> 32) as usize; let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 5d2fb450..73e0efac 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -313,6 +313,7 @@ pub struct RSPaxosReplica { // RSPaxosReplica common helpers impl RSPaxosReplica { /// Create an empty null instance. + #[inline] fn null_instance(&self) -> Result { Ok(Instance { bal: 0, @@ -335,18 +336,32 @@ impl RSPaxosReplica { }) } + /// Locate the first null slot or append a null instance if no holes exist. + fn first_null_slot(&mut self) -> Result { + for s in self.commit_bar..(self.start_slot + self.insts.len()) { + if self.insts[s - self.start_slot].status == Status::Null { + return Ok(s); + } + } + self.insts.push(self.null_instance()?); + Ok(self.start_slot + self.insts.len() - 1) + } + /// Compose a unique ballot number from base. + #[inline] fn make_unique_ballot(&self, base: u64) -> Ballot { ((base << 8) | ((self.id + 1) as u64)) as Ballot } /// Compose a unique ballot number greater than the given one. + #[inline] fn make_greater_ballot(&self, bal: Ballot) -> Ballot { self.make_unique_ballot((bal >> 8) + 1) } /// Compose LogActionId from slot index & entry type. /// Uses the `Status` enum type to represent differnet entry types. + #[inline] fn make_log_action_id(slot: usize, entry_type: Status) -> LogActionId { let type_num = match entry_type { Status::Preparing => 1, @@ -358,6 +373,7 @@ impl RSPaxosReplica { } /// Decompose LogActionId into slot index & entry type. + #[inline] fn split_log_action_id(log_action_id: LogActionId) -> (usize, Status) { let slot = (log_action_id >> 2) as usize; let type_num = log_action_id & ((1 << 2) - 1); @@ -371,6 +387,7 @@ impl RSPaxosReplica { } /// Compose CommandId from slot index & command index within. + #[inline] fn make_command_id(slot: usize, cmd_idx: usize) -> CommandId { assert!(slot <= (u32::MAX as usize)); assert!(cmd_idx <= (u32::MAX as usize)); @@ -378,6 +395,7 @@ impl RSPaxosReplica { } /// Decompose CommandId into slot index & command index within. 
+ #[inline] fn split_command_id(command_id: CommandId) -> (usize, usize) { let slot = (command_id >> 32) as usize; let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; @@ -426,33 +444,18 @@ impl RSPaxosReplica { reqs_cw.compute_parity(Some(&self.rs_coder))?; // create a new instance in the first null slot (or append a new one - // at the end if no holes exist) - let mut slot = self.start_slot + self.insts.len(); - for s in self.commit_bar..(self.start_slot + self.insts.len()) { - if self.insts[s - self.start_slot].status == Status::Null { - slot = s; - break; - } - } - if slot < self.start_slot + self.insts.len() { - let old_inst = &mut self.insts[slot - self.start_slot]; - assert_eq!(old_inst.status, Status::Null); - old_inst.reqs_cw = reqs_cw; - old_inst.leader_bk = Some(LeaderBookkeeping { - prepare_acks: Bitmap::new(self.population, false), - prepare_max_bal: 0, - accept_acks: Bitmap::new(self.population, false), - }); - } else { - let mut new_inst = self.null_instance()?; - new_inst.reqs_cw = reqs_cw; - new_inst.leader_bk = Some(LeaderBookkeeping { + // at the end if no holes exist); fill it up with incoming data + let slot = self.first_null_slot()?; + { + let inst = &mut self.insts[slot - self.start_slot]; + assert_eq!(inst.status, Status::Null); + inst.reqs_cw = reqs_cw; + inst.leader_bk = Some(LeaderBookkeeping { prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, accept_acks: Bitmap::new(self.population, false), }); - new_inst.external = true; - self.insts.push(new_inst); + inst.external = true; } // decide whether we can enter fast path for this instance @@ -648,6 +651,8 @@ impl RSPaxosReplica { if inst.status < Status::Committed { break; } + let now_slot = self.commit_bar; + self.commit_bar += 1; if inst.reqs_cw.avail_shards() < self.quorum_cnt { // can't execute if I don't have the complete request batch @@ -668,7 +673,7 @@ impl RSPaxosReplica { for (cmd_idx, (_, req)) in reqs.iter().enumerate() { if let ApiRequest::Req { cmd, .. } = req { self.state_machine.submit_cmd( - Self::make_command_id(self.commit_bar, cmd_idx), + Self::make_command_id(now_slot, cmd_idx), cmd.clone(), )?; } else { @@ -676,10 +681,8 @@ impl RSPaxosReplica { } } pf_trace!(self.id; "submitted {} exec commands for slot {}", - reqs.len(), self.commit_bar); + reqs.len(), now_slot); } - - self.commit_bar += 1; } } @@ -1102,7 +1105,6 @@ impl RSPaxosReplica { peer, slot, ballot, reqs_cw.avail_shards_map()); assert!(slot < self.start_slot + self.insts.len()); assert!(self.insts[slot - self.start_slot].status >= Status::Committed); - let num_insts = self.start_slot + self.insts.len(); let inst = &mut self.insts[slot - self.start_slot]; // if reply not outdated and ballot is up-to-date @@ -1111,10 +1113,10 @@ impl RSPaxosReplica { inst.reqs_cw.absorb_other(reqs_cw)?; // if enough shards have been gathered, can push execution forward - if slot == self.commit_bar { - while self.commit_bar < num_insts { - let inst = - &mut self.insts[self.commit_bar - self.start_slot]; + if slot == self.exec_bar { + let mut now_slot = self.exec_bar; + while now_slot < self.start_slot + self.insts.len() { + let inst = &mut self.insts[now_slot - self.start_slot]; if inst.status < Status::Committed || inst.reqs_cw.avail_shards() < self.quorum_cnt { @@ -1135,10 +1137,7 @@ impl RSPaxosReplica { for (cmd_idx, (_, req)) in reqs.iter().enumerate() { if let ApiRequest::Req { cmd, .. 
} = req { self.state_machine.submit_cmd( - Self::make_command_id( - self.commit_bar, - cmd_idx, - ), + Self::make_command_id(now_slot, cmd_idx), cmd.clone(), )?; } else { @@ -1146,10 +1145,10 @@ impl RSPaxosReplica { } } pf_trace!(self.id; "submitted {} exec commands for slot {}", - reqs.len(), self.commit_bar); + reqs.len(), now_slot); } - self.commit_bar += 1; + now_slot += 1; } } } @@ -1571,7 +1570,7 @@ impl RSPaxosReplica { LogEntry::CommitSlot { slot } => { assert!(slot < self.start_slot + self.insts.len()); - // update instance state + // update instance status self.insts[slot - self.start_slot].status = Status::Committed; // submit commands in contiguously committed instance to the // state machine @@ -1582,6 +1581,8 @@ impl RSPaxosReplica { if inst.status < Status::Committed { break; } + // update commit_bar + self.commit_bar += 1; // check number of available shards if inst.reqs_cw.avail_shards() < self.quorum_cnt { // can't execute if I don't have the complete request batch @@ -1602,9 +1603,9 @@ impl RSPaxosReplica { let _ = self.state_machine.get_result().await?; } } - // update commit_bar and exec_bar - self.commit_bar += 1; + // update instance status and exec_bar self.exec_bar += 1; + inst.status = Status::Executed; } } } @@ -1657,6 +1658,10 @@ impl RSPaxosReplica { offset_ok: true, .. } = log_result { + if self.log_offset > 0 { + pf_info!(self.id; "recovered from wal log: commit {} exec {}", + self.commit_bar, self.exec_bar); + } Ok(()) } else { logged_err!(self.id; "unexpected log result type or failed truncate") @@ -1849,6 +1854,10 @@ impl RSPaxosReplica { self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { new_start: self.start_slot, })?; + + if self.start_slot > 0 { + pf_info!(self.id; "recovered from snapshot: start {}", self.start_slot); + } Ok(()) } diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index c6a283c4..a0345d7e 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -141,6 +141,7 @@ pub struct SimplePushReplica { // SimplePushReplica common helpers impl SimplePushReplica { /// Compose CommandId from instance index & command index within. + #[inline] fn make_command_id(inst_idx: usize, cmd_idx: usize) -> CommandId { assert!(inst_idx <= (u32::MAX as usize)); assert!(cmd_idx <= (u32::MAX as usize)); @@ -148,6 +149,7 @@ impl SimplePushReplica { } /// Decompose CommandId into instance index & command index within. + #[inline] fn split_command_id(command_id: CommandId) -> (usize, usize) { let inst_idx = (command_id >> 32) as usize; let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; From f30dd2b7cc3befb72b3e55df979946a399e741d5 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Tue, 3 Oct 2023 23:02:33 -0500 Subject: [PATCH 74/89] fixed snapshotting commit_bar bug --- src/protocols/crossword.rs | 56 ++++++++++++++++++++++---- src/protocols/multipaxos.rs | 56 ++++++++++++++++++++++---- src/protocols/rs_paxos.rs | 56 ++++++++++++++++++++++---- summerset_client/src/clients/tester.rs | 21 ++++++++++ 4 files changed, 165 insertions(+), 24 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 11a76c6a..9c5aa4d6 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -192,9 +192,14 @@ enum LogEntry { /// Snapshot file entry type. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum SnapEntry { - /// First entry at the start of file: number of log instances covered by - /// this snapshot file == the start slot index of in-mem log. 
- StartSlot { slot: usize }, + /// Necessary slot indices to remember. + SlotInfo { + /// First entry at the start of file: number of log instances covered + /// by this snapshot file == the start slot index of in-mem log. + start_slot: usize, + /// Index of the first non-committed slot. + commit_bar: usize, + }, /// Set of key-value pairs to apply to the state. KVPairSet { pairs: HashMap }, @@ -1977,7 +1982,8 @@ impl CrosswordReplica { /// to that index as well as outdate entries in the durable WAL log file. /// /// NOTE: the current implementation does not guard against crashes in the - /// middle of taking a snapshot. + /// middle of taking a snapshot. Production quality implementations should + /// make the snapshotting action "atomic". async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { pf_debug!(self.id; "taking new snapshot: start {} exec {}", self.start_slot, self.exec_bar); @@ -1993,6 +1999,28 @@ impl CrosswordReplica { } self.snapshot_dump_kv_pairs().await?; + // write new slot info entry to the head of snapshot + self.snapshot_hub.submit_action( + 0, + LogAction::Write { + entry: SnapEntry::SlotInfo { + start_slot: self.exec_bar, + commit_bar: self.commit_bar, + }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + match log_result { + LogResult::Write { + offset_ok: true, .. + } => {} + _ => { + return logged_err!(self.id; "unexpected log result type or failed truncate"); + } + } + // update start_slot and discard all in-memory log instances up to exec_bar self.insts.drain(0..(self.exec_bar - self.start_slot)); self.start_slot = self.exec_bar; @@ -2023,11 +2051,19 @@ impl CrosswordReplica { match log_result { LogResult::Read { - entry: Some(SnapEntry::StartSlot { slot }), + entry: + Some(SnapEntry::SlotInfo { + start_slot, + commit_bar, + }), end_offset, } => { self.snap_offset = end_offset; - self.start_slot = slot; // get start slot index of in-mem log + + // recover necessary slot indices info + self.start_slot = start_slot; + self.commit_bar = commit_bar; + self.exec_bar = start_slot; // repeatedly apply key-value pairs loop { @@ -2072,7 +2108,8 @@ impl CrosswordReplica { })?; if self.start_slot > 0 { - pf_info!(self.id; "recovered from snapshot: start {}", self.start_slot); + pf_info!(self.id; "recovered from snapshot: start {} commit {} exec {}", + self.start_slot, self.commit_bar, self.exec_bar); } Ok(()) } @@ -2082,7 +2119,10 @@ impl CrosswordReplica { self.snapshot_hub.submit_action( 0, LogAction::Write { - entry: SnapEntry::StartSlot { slot: 0 }, + entry: SnapEntry::SlotInfo { + start_slot: 0, + commit_bar: 0, + }, offset: 0, sync: self.config.logger_sync, }, diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 85f01231..fbcfc682 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -176,9 +176,14 @@ enum LogEntry { /// Snapshot file entry type. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum SnapEntry { - /// First entry at the start of file: number of log instances covered by - /// this snapshot file == the start slot index of in-mem log. - StartSlot { slot: usize }, + /// Necessary slot indices to remember. + SlotInfo { + /// First entry at the start of file: number of log instances covered + /// by this snapshot file == the start slot index of in-mem log. + start_slot: usize, + /// Index of the first non-committed slot. + commit_bar: usize, + }, /// Set of key-value pairs to apply to the state. 
KVPairSet { pairs: HashMap }, @@ -1521,7 +1526,8 @@ impl MultiPaxosReplica { /// to that index as well as outdate entries in the durable WAL log file. /// /// NOTE: the current implementation does not guard against crashes in the - /// middle of taking a snapshot. + /// middle of taking a snapshot. Production quality implementations should + /// make the snapshotting action "atomic". async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { pf_debug!(self.id; "taking new snapshot: start {} exec {}", self.start_slot, self.exec_bar); @@ -1537,6 +1543,28 @@ impl MultiPaxosReplica { } self.snapshot_dump_kv_pairs().await?; + // write new slot info entry to the head of snapshot + self.snapshot_hub.submit_action( + 0, + LogAction::Write { + entry: SnapEntry::SlotInfo { + start_slot: self.exec_bar, + commit_bar: self.commit_bar, + }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + match log_result { + LogResult::Write { + offset_ok: true, .. + } => {} + _ => { + return logged_err!(self.id; "unexpected log result type or failed truncate"); + } + } + // update start_slot and discard all in-memory log instances up to exec_bar self.insts.drain(0..(self.exec_bar - self.start_slot)); self.start_slot = self.exec_bar; @@ -1567,11 +1595,19 @@ impl MultiPaxosReplica { match log_result { LogResult::Read { - entry: Some(SnapEntry::StartSlot { slot }), + entry: + Some(SnapEntry::SlotInfo { + start_slot, + commit_bar, + }), end_offset, } => { self.snap_offset = end_offset; - self.start_slot = slot; // get start slot index of in-mem log + + // recover necessary slot indices info + self.start_slot = start_slot; + self.commit_bar = commit_bar; + self.exec_bar = start_slot; // repeatedly apply key-value pairs loop { @@ -1616,7 +1652,8 @@ impl MultiPaxosReplica { })?; if self.start_slot > 0 { - pf_info!(self.id; "recovered from snapshot: start {}", self.start_slot); + pf_info!(self.id; "recovered from snapshot: start {} commit {} exec {}", + self.start_slot, self.commit_bar, self.exec_bar); } Ok(()) } @@ -1626,7 +1663,10 @@ impl MultiPaxosReplica { self.snapshot_hub.submit_action( 0, LogAction::Write { - entry: SnapEntry::StartSlot { slot: 0 }, + entry: SnapEntry::SlotInfo { + start_slot: 0, + commit_bar: 0, + }, offset: 0, sync: self.config.logger_sync, }, diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 73e0efac..3b5c25b2 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -178,9 +178,14 @@ enum LogEntry { /// Snapshot file entry type. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum SnapEntry { - /// First entry at the start of file: number of log instances covered by - /// this snapshot file == the start slot index of in-mem log. - StartSlot { slot: usize }, + /// Necessary slot indices to remember. + SlotInfo { + /// First entry at the start of file: number of log instances covered + /// by this snapshot file == the start slot index of in-mem log. + start_slot: usize, + /// Index of the first non-committed slot. + commit_bar: usize, + }, /// Set of key-value pairs to apply to the state. KVPairSet { pairs: HashMap }, @@ -1761,7 +1766,8 @@ impl RSPaxosReplica { /// to that index as well as outdate entries in the durable WAL log file. /// /// NOTE: the current implementation does not guard against crashes in the - /// middle of taking a snapshot. + /// middle of taking a snapshot. 
Production quality implementations should + /// make the snapshotting action "atomic". async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { pf_debug!(self.id; "taking new snapshot: start {} exec {}", self.start_slot, self.exec_bar); @@ -1777,6 +1783,28 @@ impl RSPaxosReplica { } self.snapshot_dump_kv_pairs().await?; + // write new slot info entry to the head of snapshot + self.snapshot_hub.submit_action( + 0, + LogAction::Write { + entry: SnapEntry::SlotInfo { + start_slot: self.exec_bar, + commit_bar: self.commit_bar, + }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + match log_result { + LogResult::Write { + offset_ok: true, .. + } => {} + _ => { + return logged_err!(self.id; "unexpected log result type or failed truncate"); + } + } + // update start_slot and discard all in-memory log instances up to exec_bar self.insts.drain(0..(self.exec_bar - self.start_slot)); self.start_slot = self.exec_bar; @@ -1807,11 +1835,19 @@ impl RSPaxosReplica { match log_result { LogResult::Read { - entry: Some(SnapEntry::StartSlot { slot }), + entry: + Some(SnapEntry::SlotInfo { + start_slot, + commit_bar, + }), end_offset, } => { self.snap_offset = end_offset; - self.start_slot = slot; // get start slot index of in-mem log + + // recover necessary slot indices info + self.start_slot = start_slot; + self.commit_bar = commit_bar; + self.exec_bar = start_slot; // repeatedly apply key-value pairs loop { @@ -1856,7 +1892,8 @@ impl RSPaxosReplica { })?; if self.start_slot > 0 { - pf_info!(self.id; "recovered from snapshot: start {}", self.start_slot); + pf_info!(self.id; "recovered from snapshot: start {} commit {} exec {}", + self.start_slot, self.commit_bar, self.exec_bar); } Ok(()) } @@ -1866,7 +1903,10 @@ impl RSPaxosReplica { self.snapshot_hub.submit_action( 0, LogAction::Write { - entry: SnapEntry::StartSlot { slot: 0 }, + entry: SnapEntry::SlotInfo { + start_slot: 0, + commit_bar: 0, + }, offset: 0, sync: self.config.logger_sync, }, diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index 378256b7..4fb021e0 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -467,6 +467,7 @@ impl ClientTester { self.checked_put("Jose", &v, Some(None), 0).await?; for (s, is_leader) in self.query_servers().await? { if !is_leader { + // picked a non-leader replica self.driver.leave(false).await?; self.reset_servers(HashSet::from([s]), true).await?; time::sleep(Duration::from_secs(1)).await; @@ -484,6 +485,7 @@ impl ClientTester { self.checked_put("Jose", &v, Some(None), 0).await?; for (s, is_leader) in self.query_servers().await? { if is_leader { + // picked a leader replica self.driver.leave(false).await?; self.reset_servers(HashSet::from([s]), true).await?; time::sleep(Duration::from_secs(1)).await; @@ -515,6 +517,7 @@ impl ClientTester { } } if resets.len() == 2 { + // picked two replicas, one leader and one non-leader self.driver.leave(false).await?; self.reset_servers(resets, true).await?; time::sleep(Duration::from_secs(1)).await; @@ -543,6 +546,7 @@ impl ClientTester { time::sleep(Duration::from_millis(500)).await; for (s, is_leader) in self.query_servers().await? 
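// ---------------------------------------------------------------------------
// Aside: a compact recap (illustrative, not part of the diffs above) of what the
// new SnapEntry::SlotInfo head entry buys during recovery. take_new_snapshot()
// writes it at offset 0 with start_slot = exec_bar, so a restarting replica can
// restore all three log cursors instead of only the start slot, e.g.:
//
//     on disk : SlotInfo { start_slot: 42, commit_bar: 57 }
//     recovery: self.start_slot = 42;  // in-mem log now begins at slot 42
//               self.exec_bar   = 42;  // slots below 42 were already executed
//               self.commit_bar = 57;  // committed-but-unexecuted cursor survives
//
// Persisting commit_bar alongside start_slot is the change this commit's
// "snapshotting commit_bar bug" fix introduces; previously only the start slot
// was written to the snapshot head.
// ---------------------------------------------------------------------------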
{ if !is_leader { + // picked a non-leader replica self.driver.leave(false).await?; self.pause_servers(HashSet::from([s])).await?; time::sleep(Duration::from_secs(1)).await; @@ -563,6 +567,7 @@ impl ClientTester { time::sleep(Duration::from_millis(500)).await; for (s, is_leader) in self.query_servers().await? { if is_leader { + // picked a leader replica self.driver.leave(false).await?; self.pause_servers(HashSet::from([s])).await?; time::sleep(Duration::from_secs(1)).await; @@ -583,24 +588,28 @@ impl ClientTester { time::sleep(Duration::from_millis(500)).await; for (s, is_leader) in self.query_servers().await? { if is_leader { + // picked a leader replica self.driver.leave(false).await?; self.pause_servers(HashSet::from([s])).await?; time::sleep(Duration::from_secs(1)).await; self.driver.connect().await?; let v1 = Self::gen_rand_string(8); self.checked_put("Jose", &v1, Some(Some(&v0)), 0).await?; + // resuming old leader replica self.driver.leave(false).await?; self.resume_servers(HashSet::from([s])).await?; time::sleep(Duration::from_secs(1)).await; self.driver.connect().await?; let v2 = Self::gen_rand_string(8); self.checked_put("Jose", &v2, Some(Some(&v1)), 1).await?; + // pausing that replica again self.driver.leave(false).await?; self.pause_servers(HashSet::from([s])).await?; time::sleep(Duration::from_secs(1)).await; self.driver.connect().await?; let v3 = Self::gen_rand_string(8); self.checked_put("Jose", &v3, Some(Some(&v2)), 0).await?; + // resuming that replica again self.driver.leave(false).await?; self.resume_servers(HashSet::from([s])).await?; time::sleep(Duration::from_secs(1)).await; @@ -619,9 +628,21 @@ impl ClientTester { self.checked_put("Jose", &v0, Some(None), 0).await?; let v1 = Self::gen_rand_string(8); self.checked_put("Shawn", &v1, Some(None), 0).await?; + // forcing all nodes to take snapshot time::sleep(Duration::from_millis(500)).await; self.force_snapshot(HashSet::new()).await?; self.checked_put("Jose", &v1, Some(Some(&v0)), 0).await?; + // reseting all nodes and see if things are there + self.driver.leave(false).await?; + self.reset_servers(HashSet::new(), true).await?; + time::sleep(Duration::from_secs(1)).await; + self.driver.connect().await?; + self.checked_get("Shawn", Some(Some(&v1)), 0).await?; + self.checked_get("Jose", Some(Some(&v1)), 0).await?; + // forcing all nodes to take snapshot again + time::sleep(Duration::from_millis(500)).await; + self.force_snapshot(HashSet::new()).await?; + // reseting all nodes again and check again self.driver.leave(false).await?; self.reset_servers(HashSet::new(), true).await?; time::sleep(Duration::from_secs(1)).await; From 8dd10a28a923e6da3f0b26aa8d903a8b9cc1386c Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Wed, 4 Oct 2023 01:21:39 -0500 Subject: [PATCH 75/89] add chunking to reconstruction read messages --- src/protocols/crossword.rs | 328 +++++++++++++++++++++++-------------- src/protocols/rs_paxos.rs | 200 ++++++++++++---------- src/server/transport.rs | 11 +- 3 files changed, 318 insertions(+), 221 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 9c5aa4d6..1963f016 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -69,6 +69,9 @@ pub struct ReplicaConfigCrossword { /// Fault-tolerance level. pub fault_tolerance: u8, + /// Maximum chunk size of a ReconstructRead message. + pub recon_chunk_size: usize, + /// Number of shards to assign to each replica. // TODO: proper config options. 
pub shards_per_replica: u8, @@ -96,6 +99,7 @@ impl Default for ReplicaConfigCrossword { gossip_timeout_min: 100, gossip_timeout_max: 300, fault_tolerance: 0, + recon_chunk_size: 1000, shards_per_replica: 1, perf_storage_a: 0, perf_storage_b: 0, @@ -234,13 +238,15 @@ enum PeerMsg { Commit { slot: usize }, /// Reconstruction read from new leader to replicas. - Reconstruct { slot: usize, exclude: Vec }, + Reconstruct { + /// Map from slot -> shards to exclude. + slots_excl: HashMap>, + }, /// Reconstruction read reply from replica to leader. ReconstructReply { - slot: usize, - ballot: Ballot, - reqs_cw: RSCodeword, + /// Map from slot -> (ballot, peer shards). + slots_data: HashMap)>, }, /// Leader activity heartbeat. @@ -1183,43 +1189,44 @@ impl CrosswordReplica { fn handle_msg_reconstruct( &mut self, peer: ReplicaId, - slot: usize, - exclude: Vec, + slots_excl: HashMap>, ) -> Result<(), SummersetError> { - if slot < self.start_slot { - return Ok(()); // ignore if slot index outdated - } - pf_trace!(self.id; "received Reconstruct <- {} for slot {}", peer, slot); + pf_trace!(self.id; "received Reconstruct <- {} for {} slots", + peer, slots_excl.len()); + let mut slots_data = HashMap::new(); - // locate instance in memory, filling in null instances if needed - while self.start_slot + self.insts.len() <= slot { - self.insts.push(self.null_instance()?); - } - let inst = &mut self.insts[slot - self.start_slot]; + for (slot, exclude) in slots_excl { + if slot < self.start_slot { + continue; // ignore if slot index outdated + } - // ignore spurious duplications; also ignore if I have nothing to send back - if inst.status < Status::Accepting { - return Ok(()); - } - let mut subset = Bitmap::from(inst.reqs_cw.num_shards(), exclude); - subset.flip(); // exclude unwanted shards the sender already has - let reply_cw = inst.reqs_cw.subset_copy(subset, false)?; - if reply_cw.avail_shards() == 0 { - return Ok(()); - } + // locate instance in memory, filling in null instances if needed + while self.start_slot + self.insts.len() <= slot { + self.insts.push(self.null_instance()?); + } + let inst = &mut self.insts[slot - self.start_slot]; - // send back my ballot for this slot and the available shards - self.transport_hub.send_msg( - PeerMsg::ReconstructReply { - slot, - ballot: inst.bal, - reqs_cw: reply_cw.clone(), - }, - peer, - )?; - pf_trace!(self.id; "sent ReconstructReply message for slot {} bal {}", - slot, inst.bal); + // ignore spurious duplications; also ignore if I have nothing to send back + if inst.status < Status::Accepting { + continue; + } + let mut subset = Bitmap::from(inst.reqs_cw.num_shards(), exclude); + subset.flip(); // exclude unwanted shards the sender already has + let reply_cw = inst.reqs_cw.subset_copy(subset, false)?; + if reply_cw.avail_shards() == 0 { + continue; + } + // send back my ballot for this slot and the available shards + slots_data.insert(slot, (inst.bal, reply_cw)); + } + + if !slots_data.is_empty() { + let num_slots = slots_data.len(); + self.transport_hub + .send_msg(PeerMsg::ReconstructReply { slots_data }, peer)?; + pf_trace!(self.id; "sent ReconstructReply message for {} slots", num_slots); + } Ok(()) } @@ -1227,61 +1234,66 @@ impl CrosswordReplica { fn handle_msg_reconstruct_reply( &mut self, peer: ReplicaId, - slot: usize, - ballot: Ballot, - reqs_cw: RSCodeword, + slots_data: HashMap)>, ) -> Result<(), SummersetError> { - if slot < self.start_slot { - return Ok(()); // ignore if slot index outdated - } - pf_trace!(self.id; "received ReconstructReply <- {} for 
slot {} bal {} shards {:?}", - peer, slot, ballot, reqs_cw.avail_shards_map()); - assert!(slot < self.start_slot + self.insts.len()); - assert!(self.insts[slot - self.start_slot].status >= Status::Committed); - let inst = &mut self.insts[slot - self.start_slot]; - - // if reply not outdated and ballot is up-to-date - if inst.status < Status::Executed && ballot >= inst.bal { - // absorb the shards from this replica - inst.reqs_cw.absorb_other(reqs_cw)?; + for (slot, (ballot, reqs_cw)) in slots_data { + if slot < self.start_slot { + continue; // ignore if slot index outdated + } + pf_trace!(self.id; "in ReconstructReply <- {} for slot {} bal {} shards {:?}", + peer, slot, ballot, reqs_cw.avail_shards_map()); + assert!(slot < self.start_slot + self.insts.len()); + assert!( + self.insts[slot - self.start_slot].status >= Status::Committed + ); + let inst = &mut self.insts[slot - self.start_slot]; - // if enough shards have been gathered, can push execution forward - if slot == self.exec_bar { - let mut now_slot = self.exec_bar; - while now_slot < self.start_slot + self.insts.len() { - let inst = &mut self.insts[now_slot - self.start_slot]; - if inst.status < Status::Committed - || inst.reqs_cw.avail_shards() < self.quorum_cnt - { - break; - } + // if reply not outdated and ballot is up-to-date + if inst.status < Status::Executed && ballot >= inst.bal { + // absorb the shards from this replica + inst.reqs_cw.absorb_other(reqs_cw)?; + + // if enough shards have been gathered, can push execution forward + if slot == self.exec_bar { + let mut now_slot = self.exec_bar; + while now_slot < self.start_slot + self.insts.len() { + let inst = &mut self.insts[now_slot - self.start_slot]; + if inst.status < Status::Committed + || inst.reqs_cw.avail_shards() < self.quorum_cnt + { + break; + } - if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { - // have enough shards but need reconstruction - inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; - } - let reqs = inst.reqs_cw.get_data()?; + if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + // have enough shards but need reconstruction + inst.reqs_cw + .reconstruct_data(Some(&self.rs_coder))?; + } + let reqs = inst.reqs_cw.get_data()?; - // submit commands in committed instance to the state machine - // for execution - if reqs.is_empty() { - inst.status = Status::Executed; - } else { - for (cmd_idx, (_, req)) in reqs.iter().enumerate() { - if let ApiRequest::Req { cmd, .. } = req { - self.state_machine.submit_cmd( - Self::make_command_id(now_slot, cmd_idx), - cmd.clone(), - )?; - } else { - continue; // ignore other types of requests + // submit commands in committed instance to the state machine + // for execution + if reqs.is_empty() { + inst.status = Status::Executed; + } else { + for (cmd_idx, (_, req)) in reqs.iter().enumerate() { + if let ApiRequest::Req { cmd, .. 
} = req { + self.state_machine.submit_cmd( + Self::make_command_id( + now_slot, cmd_idx, + ), + cmd.clone(), + )?; + } else { + continue; // ignore other types of requests + } } + pf_trace!(self.id; "submitted {} exec commands for slot {}", + reqs.len(), now_slot); } - pf_trace!(self.id; "submitted {} exec commands for slot {}", - reqs.len(), now_slot); - } - now_slot += 1; + now_slot += 1; + } } } } @@ -1313,14 +1325,12 @@ impl CrosswordReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), - PeerMsg::Reconstruct { slot, exclude } => { - self.handle_msg_reconstruct(peer, slot, exclude) + PeerMsg::Reconstruct { slots_excl } => { + self.handle_msg_reconstruct(peer, slots_excl) + } + PeerMsg::ReconstructReply { slots_data } => { + self.handle_msg_reconstruct_reply(peer, slots_data) } - PeerMsg::ReconstructReply { - slot, - ballot, - reqs_cw, - } => self.handle_msg_reconstruct_reply(peer, slot, ballot, reqs_cw), PeerMsg::Heartbeat { ballot, exec_bar } => { self.heard_heartbeat(peer, ballot, exec_bar) } @@ -1412,6 +1422,7 @@ impl CrosswordReplica { self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen); self.bal_max_seen = self.bal_prep_sent; + let mut recon_slots: HashMap> = HashMap::new(); for (slot, inst) in self .insts .iter_mut() @@ -1461,31 +1472,48 @@ impl CrosswordReplica { if inst.status == Status::Committed && inst.reqs_cw.avail_shards() < self.quorum_cnt { - self.transport_hub.bcast_msg( - PeerMsg::Reconstruct { - slot, - exclude: inst - .reqs_cw - .avail_shards_map() - .iter() - .filter_map( - |(idx, flag)| { - if flag { - Some(idx) - } else { - None - } - }, - ) - .collect(), - }, - None, - )?; - pf_trace!(self.id; "broadcast Reconstruct messages for slot {} bal {} shards {:?}", - slot, inst.bal, inst.reqs_cw.avail_shards_map()); + recon_slots.insert( + slot, + inst.reqs_cw + .avail_shards_map() + .iter() + .filter_map( + |(idx, flag)| { + if flag { + Some(idx) + } else { + None + } + }, + ) + .collect(), + ); + + // send reconstruction read messages in chunks + if recon_slots.len() == self.config.recon_chunk_size { + self.transport_hub.bcast_msg( + PeerMsg::Reconstruct { + slots_excl: std::mem::take(&mut recon_slots), + }, + None, + )?; + pf_trace!(self.id; "broadcast Reconstruct messages for {} slots", + self.config.recon_chunk_size); + } } } + // send reconstruction read message for remaining slots + if !recon_slots.is_empty() { + let num_slots = recon_slots.len(); + self.transport_hub.bcast_msg( + PeerMsg::Reconstruct { + slots_excl: recon_slots, + }, + None, + )?; + pf_trace!(self.id; "broadcast Reconstruct messages for {} slots", num_slots); + } Ok(()) } @@ -1565,8 +1593,17 @@ impl CrosswordReplica { /// Triggers gossiping for my missing shards in committed but not-yet- /// executed instances: fetch missing shards from peers, preferring /// follower peers that hold data shards. + // TODO: prefer replicas with original data shards first fn trigger_gossiping(&mut self) -> Result<(), SummersetError> { - // TODO: want cleverer design than this! 
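// ---------------------------------------------------------------------------
// Aside: a runnable sketch (not from the patch) of the batching policy that the
// new recon_chunk_size config enforces: reconstruction reads for many slots are
// grouped into Reconstruct messages carrying at most recon_chunk_size entries
// each, instead of one message per slot.
fn chunking_example() {
    let recon_chunk_size = 2; // tiny value for illustration; defaults above are in the thousands
    let pending_slots: Vec<usize> = vec![10, 11, 13, 17, 18];
    let messages: Vec<Vec<usize>> = pending_slots
        .chunks(recon_chunk_size)
        .map(|chunk| chunk.to_vec())
        .collect();
    // five pending slots become three Reconstruct messages
    assert_eq!(messages, vec![vec![10, 11], vec![13, 17], vec![18]]);
}
// ---------------------------------------------------------------------------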
+ // maintain a map from peer ID to send to -> slots_excl to send + let mut recon_slots: HashMap>> = + HashMap::new(); + for peer in 0..self.population { + if peer != self.id { + recon_slots.insert(peer, HashMap::new()); + } + } + let mut slot_up_to = self.exec_bar; for slot in self.exec_bar..(self.start_slot + self.insts.len()) { slot_up_to = slot; @@ -1578,16 +1615,21 @@ impl CrosswordReplica { } if inst.reqs_cw.avail_shards() < self.quorum_cnt { - let mut target = Bitmap::new(self.population, true); - if let Some(ReplicaBookkeeping { source }) = inst.replica_bk { - // skip leader who initially replicated this instance to me - target.set(source, false)?; - } - self.transport_hub.bcast_msg( - PeerMsg::Reconstruct { + for peer in 0..self.population { + if peer == self.id { + continue; + } + if let Some(ReplicaBookkeeping { source }) = inst.replica_bk + { + if peer == source { + // skip leader who initially replicated this instance to me + continue; + } + } + + recon_slots.get_mut(&peer).unwrap().insert( slot, - exclude: inst - .reqs_cw + inst.reqs_cw .avail_shards_map() .iter() .filter_map( @@ -1600,9 +1642,33 @@ impl CrosswordReplica { }, ) .collect(), - }, - Some(target), - )?; + ); + + // send reconstruction read messages in chunks + if recon_slots[&peer].len() == self.config.recon_chunk_size + { + self.transport_hub.send_msg( + PeerMsg::Reconstruct { + slots_excl: std::mem::take( + recon_slots.get_mut(&peer).unwrap(), + ), + }, + peer, + )?; + pf_trace!(self.id; "sent Reconstruct -> {} for {} slots", + peer, self.config.recon_chunk_size); + } + } + } + } + + // send reconstruction read message for remaining slots + for (peer, slots_excl) in recon_slots.drain() { + if !slots_excl.is_empty() { + let num_slots = slots_excl.len(); + self.transport_hub + .send_msg(PeerMsg::Reconstruct { slots_excl }, peer)?; + pf_trace!(self.id; "sent Reconstruct -> {} for {} slots", peer, num_slots); } } @@ -2168,7 +2234,8 @@ impl GenericReplica for CrosswordReplica { hb_send_interval_ms, snapshot_path, snapshot_interval_s, gossip_timeout_min, gossip_timeout_max, - fault_tolerance, shards_per_replica, + fault_tolerance, recon_chunk_size, + shards_per_replica, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; if config.batch_interval_us == 0 { @@ -2199,6 +2266,13 @@ impl GenericReplica for CrosswordReplica { config.hb_send_interval_ms ); } + if config.recon_chunk_size == 0 { + return logged_err!( + id; + "invalid config.recon_chunk_size '{}'", + config.recon_chunk_size + ); + } // setup state machine module let state_machine = StateMachine::new_and_setup(id).await?; diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 3b5c25b2..19399535 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -63,6 +63,9 @@ pub struct ReplicaConfigRSPaxos { /// Fault-tolerance level. pub fault_tolerance: u8, + /// Maximum chunk size of a ReconstructRead message. + pub recon_chunk_size: usize, + // Performance simulation params (all zeros means no perf simulation): pub perf_storage_a: u64, pub perf_storage_b: u64, @@ -84,6 +87,7 @@ impl Default for ReplicaConfigRSPaxos { snapshot_path: "/tmp/summerset.rs_paxos.snap".into(), snapshot_interval_s: 0, fault_tolerance: 0, + recon_chunk_size: 1000, perf_storage_a: 0, perf_storage_b: 0, perf_network_a: 0, @@ -220,13 +224,12 @@ enum PeerMsg { Commit { slot: usize }, /// Reconstruction read from new leader to replicas. - Reconstruct { slot: usize }, + Reconstruct { slots: Vec }, /// Reconstruction read reply from replica to leader. 
ReconstructReply { - slot: usize, - ballot: Ballot, - reqs_cw: RSCodeword, + /// Map from slot -> (ballot, peer shards). + slots_data: HashMap)>, }, /// Leader activity heartbeat. @@ -1062,36 +1065,39 @@ impl RSPaxosReplica { fn handle_msg_reconstruct( &mut self, peer: ReplicaId, - slot: usize, + slots: Vec, ) -> Result<(), SummersetError> { - if slot < self.start_slot { - return Ok(()); // ignore if slot index outdated - } - pf_trace!(self.id; "received Reconstruct <- {} for slot {}", peer, slot); + pf_trace!(self.id; "received Reconstruct <- {} for slots {:?}", peer, slots); + let mut slots_data = HashMap::new(); - // locate instance in memory, filling in null instances if needed - while self.start_slot + self.insts.len() <= slot { - self.insts.push(self.null_instance()?); - } - let inst = &mut self.insts[slot - self.start_slot]; + for slot in slots { + if slot < self.start_slot { + continue; // ignore if slot index outdated + } - // ignore spurious duplications; also ignore if I have nothing to send back - if inst.status < Status::Accepting || inst.reqs_cw.avail_shards() == 0 { - return Ok(()); - } + // locate instance in memory, filling in null instances if needed + while self.start_slot + self.insts.len() <= slot { + self.insts.push(self.null_instance()?); + } + let inst = &mut self.insts[slot - self.start_slot]; - // send back my ballot for this slot and the available shards - self.transport_hub.send_msg( - PeerMsg::ReconstructReply { - slot, - ballot: inst.bal, - reqs_cw: inst.reqs_cw.clone(), - }, - peer, - )?; - pf_trace!(self.id; "sent ReconstructReply message for slot {} bal {}", - slot, inst.bal); + // ignore spurious duplications; also ignore if I have nothing to send back + if inst.status < Status::Accepting + || inst.reqs_cw.avail_shards() == 0 + { + continue; + } + + // send back my ballot for this slot and the available shards + slots_data.insert(slot, (inst.bal, inst.reqs_cw.clone())); + } + if !slots_data.is_empty() { + let num_slots = slots_data.len(); + self.transport_hub + .send_msg(PeerMsg::ReconstructReply { slots_data }, peer)?; + pf_trace!(self.id; "sent ReconstructReply message for {} slots", num_slots); + } Ok(()) } @@ -1099,61 +1105,66 @@ impl RSPaxosReplica { fn handle_msg_reconstruct_reply( &mut self, peer: ReplicaId, - slot: usize, - ballot: Ballot, - reqs_cw: RSCodeword, + slots_data: HashMap)>, ) -> Result<(), SummersetError> { - if slot < self.start_slot { - return Ok(()); // ignore if slot index outdated - } - pf_trace!(self.id; "received ReconstructReply <- {} for slot {} bal {} shards {:?}", - peer, slot, ballot, reqs_cw.avail_shards_map()); - assert!(slot < self.start_slot + self.insts.len()); - assert!(self.insts[slot - self.start_slot].status >= Status::Committed); - let inst = &mut self.insts[slot - self.start_slot]; - - // if reply not outdated and ballot is up-to-date - if inst.status < Status::Executed && ballot >= inst.bal { - // absorb the shards from this replica - inst.reqs_cw.absorb_other(reqs_cw)?; + for (slot, (ballot, reqs_cw)) in slots_data { + if slot < self.start_slot { + continue; // ignore if slot index outdated + } + pf_trace!(self.id; "in ReconstructReply <- {} for slot {} bal {} shards {:?}", + peer, slot, ballot, reqs_cw.avail_shards_map()); + assert!(slot < self.start_slot + self.insts.len()); + assert!( + self.insts[slot - self.start_slot].status >= Status::Committed + ); + let inst = &mut self.insts[slot - self.start_slot]; - // if enough shards have been gathered, can push execution forward - if slot == self.exec_bar { - let 
mut now_slot = self.exec_bar; - while now_slot < self.start_slot + self.insts.len() { - let inst = &mut self.insts[now_slot - self.start_slot]; - if inst.status < Status::Committed - || inst.reqs_cw.avail_shards() < self.quorum_cnt - { - break; - } + // if reply not outdated and ballot is up-to-date + if inst.status < Status::Executed && ballot >= inst.bal { + // absorb the shards from this replica + inst.reqs_cw.absorb_other(reqs_cw)?; + + // if enough shards have been gathered, can push execution forward + if slot == self.exec_bar { + let mut now_slot = self.exec_bar; + while now_slot < self.start_slot + self.insts.len() { + let inst = &mut self.insts[now_slot - self.start_slot]; + if inst.status < Status::Committed + || inst.reqs_cw.avail_shards() < self.quorum_cnt + { + break; + } - if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { - // have enough shards but need reconstruction - inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; - } - let reqs = inst.reqs_cw.get_data()?; + if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + // have enough shards but need reconstruction + inst.reqs_cw + .reconstruct_data(Some(&self.rs_coder))?; + } + let reqs = inst.reqs_cw.get_data()?; - // submit commands in committed instance to the state machine - // for execution - if reqs.is_empty() { - inst.status = Status::Executed; - } else { - for (cmd_idx, (_, req)) in reqs.iter().enumerate() { - if let ApiRequest::Req { cmd, .. } = req { - self.state_machine.submit_cmd( - Self::make_command_id(now_slot, cmd_idx), - cmd.clone(), - )?; - } else { - continue; // ignore other types of requests + // submit commands in committed instance to the state machine + // for execution + if reqs.is_empty() { + inst.status = Status::Executed; + } else { + for (cmd_idx, (_, req)) in reqs.iter().enumerate() { + if let ApiRequest::Req { cmd, .. 
} = req { + self.state_machine.submit_cmd( + Self::make_command_id( + now_slot, cmd_idx, + ), + cmd.clone(), + )?; + } else { + continue; // ignore other types of requests + } } + pf_trace!(self.id; "submitted {} exec commands for slot {}", + reqs.len(), now_slot); } - pf_trace!(self.id; "submitted {} exec commands for slot {}", - reqs.len(), now_slot); - } - now_slot += 1; + now_slot += 1; + } } } } @@ -1185,14 +1196,12 @@ impl RSPaxosReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), - PeerMsg::Reconstruct { slot } => { - self.handle_msg_reconstruct(peer, slot) + PeerMsg::Reconstruct { slots } => { + self.handle_msg_reconstruct(peer, slots) + } + PeerMsg::ReconstructReply { slots_data } => { + self.handle_msg_reconstruct_reply(peer, slots_data) } - PeerMsg::ReconstructReply { - slot, - ballot, - reqs_cw, - } => self.handle_msg_reconstruct_reply(peer, slot, ballot, reqs_cw), PeerMsg::Heartbeat { ballot, exec_bar } => { self.heard_heartbeat(peer, ballot, exec_bar) } @@ -1284,6 +1293,7 @@ impl RSPaxosReplica { self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen); self.bal_max_seen = self.bal_prep_sent; + let mut recon_slots = Vec::new(); for (slot, inst) in self .insts .iter_mut() @@ -1333,13 +1343,18 @@ impl RSPaxosReplica { if inst.status == Status::Committed && inst.reqs_cw.avail_shards() < self.quorum_cnt { - self.transport_hub - .bcast_msg(PeerMsg::Reconstruct { slot }, None)?; - pf_trace!(self.id; "broadcast Reconstruct messages for slot {} bal {} shards {:?}", - slot, inst.bal, inst.reqs_cw.avail_shards_map()); + recon_slots.push(slot); } } + // send reconstruction read messages in chunks + for chunk in recon_slots.chunks(self.config.recon_chunk_size) { + let slots = chunk.to_vec(); + let num_slots = slots.len(); + self.transport_hub + .bcast_msg(PeerMsg::Reconstruct { slots }, None)?; + pf_trace!(self.id; "broadcast Reconstruct messages for {} slots", num_slots); + } Ok(()) } @@ -1951,7 +1966,7 @@ impl GenericReplica for RSPaxosReplica { hb_hear_timeout_min, hb_hear_timeout_max, hb_send_interval_ms, snapshot_path, snapshot_interval_s, - fault_tolerance, + fault_tolerance, recon_chunk_size, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; if config.batch_interval_us == 0 { @@ -1982,6 +1997,13 @@ impl GenericReplica for RSPaxosReplica { config.hb_send_interval_ms ); } + if config.recon_chunk_size == 0 { + return logged_err!( + id; + "invalid config.recon_chunk_size '{}'", + config.recon_chunk_size + ); + } // setup state machine module let state_machine = StateMachine::new_and_setup(id).await?; diff --git a/src/server/transport.rs b/src/server/transport.rs index ba0e1e8b..e3b464f4 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -227,11 +227,12 @@ where .map_err(|e| SummersetError(e.to_string()))?; } None => { - pf_error!( - self.me; - "peer ID {} not found among connected ones", - peer - ); + // NOTE: commented out to avoid spurious error messages + // pf_error!( + // self.me; + // "peer ID {} not found among connected ones", + // peer + // ); } } From 8751839fd57fbf2156ef3bcfba09dc9f56266927 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 5 Oct 2023 13:54:59 -0500 Subject: [PATCH 76/89] update counstraint boundary figure --- models/plot_cstr_bounds.py | 50 ++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/models/plot_cstr_bounds.py b/models/plot_cstr_bounds.py index 30a6335c..a5e87eea 100644 --- 
a/models/plot_cstr_bounds.py +++ b/models/plot_cstr_bounds.py @@ -62,57 +62,59 @@ def plot_cstr_bound(idx, cluster_size): label="Crossword configs", zorder=20, ) - plt.vlines(m, ymin=m, ymax=m + 1.5, linestyles="-", color=line_color, zorder=20) - plt.vlines(n, ymin=1, ymax=2.5, linestyles="-", color=line_color, zorder=20) + plt.vlines(m, ymin=m, ymax=m + 1.4, linestyles="-", color=line_color, zorder=20) + plt.vlines(n, ymin=1, ymax=2.4, linestyles="-", color=line_color, zorder=20) # correct region xs = [m, m, n, n] - ys = [m, m + 1, 2, 1] + ys = [m, m + 1.7, 2.7, 1] plt.fill(xs, ys, color=fill_color, label="Region of fault-tolerance=f", zorder=0) - # unused x-axis range + # unused x-axis ranges + xs = [0.42, m - 0.5, m - 0.8, 0.12] + ys = [0.3, 0.3, 0, 0] + plt.fill(xs, ys, hatch="///", fill=False, linewidth=0, zorder=10) if cluster_size < CLUSTER_SIZES[-1]: - xs = [n + 0.9, X_TICKS[-1] + 0.35, X_TICKS[-1] + 0.35, n + 0.8] - ys = [0.3, 0.3, 0, 0] - plt.fill( - xs, ys, hatch="///", fill=False, edgecolor=None, linewidth=0, zorder=10 - ) + xs = [n + 1.1, X_TICKS[-1] + 0.4, X_TICKS[-1] + 0.1, n + 0.8] + plt.fill(xs, ys, hatch="///", fill=False, linewidth=0, zorder=10) # latency & throughput optimized arrows plt.arrow( - m + 0.3, - m + 1.7, - -0.9, - 0.9, + m + 0.1, + m + 2.4, + -1.3, + 0, linewidth=1, color="dimgray", length_includes_head=True, head_width=0.3, overhang=0.5, + clip_on=False, label="Tradeoff decisions", ) plt.text( - m + 0.18 if n <= 5 else m + 0.5 if n == 9 else m + 0.4, - m + 2.78 if n <= 5 else m + 2.0 if n == 9 else m + 2.4, + m + 0.3 if n < 9 else m + 0.6, + m + 2.5 if n < 9 else m + 2.2, "Lat.\noptim.", horizontalalignment="left", verticalalignment="center", color="dimgray", ) plt.arrow( - n - 0.3, - 3.3, - 0.9, - -0.9, + n + 1, + 2, + 0, + -1.3, linewidth=1, color="dimgray", length_includes_head=True, head_width=0.3, overhang=0.5, + clip_on=False, ) plt.text( - n + 0.8 if n <= 5 else n + 0.0 if n == 9 else n + 0.4, - 1 + 1.5 if n <= 5 else 1 + 2.9 if n == 9 else 1 + 2.6, + n + 1.3 if n < 7 else n + 0.4, + 1 + 1.1 if n < 7 else 1 + 2.1, "Tput.\noptim.", horizontalalignment="left", verticalalignment="center", @@ -125,7 +127,9 @@ def plot_cstr_bound(idx, cluster_size): plt.xlim((0, X_TICKS[-1] + 0.7)) plt.ylim((0, Y_TICKS[-1] + 2.7)) - plt.xticks(X_TICKS[:cluster_size], list(map(str, X_TICKS))[:cluster_size]) + plt.xticks( + X_TICKS[m - 1 : cluster_size], list(map(str, X_TICKS))[m - 1 : cluster_size] + ) plt.yticks(Y_TICKS, list(map(str, Y_TICKS))) plt.xlabel("|Quorum|", loc="right") @@ -163,7 +167,7 @@ def make_legend_arrow( color="dimgray", length_includes_head=True, head_width=0.6 * height, - overhang=0.3, + overhang=0.2, ) def make_legend_polygon( From e7a4e96464a4afc1b2949a24dd1016b4d3535a36 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Thu, 5 Oct 2023 16:22:45 -0500 Subject: [PATCH 77/89] make follower gossiping optimal --- src/protocols/crossword.rs | 177 ++++++++++++++++++++++++------------- src/utils/bitmap.rs | 10 +++ src/utils/rscoding.rs | 15 ++-- 3 files changed, 136 insertions(+), 66 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 1963f016..c86549c7 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -96,10 +96,10 @@ impl Default for ReplicaConfigCrossword { hb_send_interval_ms: 50, snapshot_path: "/tmp/summerset.rs_paxos.snap".into(), snapshot_interval_s: 0, - gossip_timeout_min: 100, - gossip_timeout_max: 300, + gossip_timeout_min: 1200, + gossip_timeout_max: 1800, fault_tolerance: 0, - 
recon_chunk_size: 1000, + recon_chunk_size: 2000, shards_per_replica: 1, perf_storage_a: 0, perf_storage_b: 0, @@ -146,6 +146,9 @@ struct LeaderBookkeeping { struct ReplicaBookkeeping { /// Source leader replica ID for replyiing to Prepares and Accepts. source: ReplicaId, + + /// Have I tried gossiping for this instance at least once? + gossip_tried: bool, } /// In-memory instance containing a (possibly partial) commands batch. @@ -436,14 +439,84 @@ impl CrosswordReplica { slot: usize, id: ReplicaId, population: u8, - num_shards: u8, + shards_per_replica: u8, ) -> Vec { let first: u8 = ((id as usize + slot) % population as usize) as u8; - (first..(first + num_shards)) + (first..(first + shards_per_replica)) .map(|i| (i % population)) .collect() } + /// TODO: should let leader incorporate assignment metadata in Accept + /// messages. With more complex assignment policies, a follower probably + /// does not know the assignment. + fn gossip_targets_excl( + slot: usize, + me: ReplicaId, + population: u8, + quorum_cnt: u8, + shards_per_replica: u8, + mut avail_shards_map: Bitmap, + replica_bk: &mut Option, + ) -> HashMap> { + let mut src_peer = me; + let mut first_try = false; + if let Some(ReplicaBookkeeping { + source, + gossip_tried, + }) = replica_bk + { + if !*gossip_tried { + src_peer = *source; + first_try = true; + // first try: exclude all parity shards + for idx in quorum_cnt..population { + avail_shards_map.set(idx, true).unwrap(); + } + *gossip_tried = true; + } + } + + // greedily considers my peers, starting from the one with my ID + 1, + // until all data shards covered + let mut targets_excl = HashMap::new(); + for p in (me + 1)..(population + me) { + let peer = p % population; + if peer == src_peer { + // skip leader who initially replicated this instance to me + continue; + } + + if !first_try { + // first try probably did not succeed, so do it conservatively + targets_excl.insert(peer, avail_shards_map.to_vec()); + } else { + // first try: only ask for a minimum number of data shards + let mut useful_shards = Vec::new(); + for idx in Self::shards_for_replica( + slot, + peer, + population, + shards_per_replica, + ) { + if !avail_shards_map.get(idx).unwrap() { + useful_shards.push(idx); + } + } + // if this peer has data shards which I don't have right now + // and I have not asked others for in this round + if !useful_shards.is_empty() { + targets_excl.insert(peer, avail_shards_map.to_vec()); + for idx in useful_shards { + avail_shards_map.set(idx, true).unwrap(); + } + } + } + } + + targets_excl + } + /// TODO: make better impl of this. fn coverage_under_faults( population: u8, @@ -682,7 +755,7 @@ impl CrosswordReplica { } else { // on follower replica, finishing the logging of a // PrepareBal entry leads to sending back a Prepare reply - if let Some(ReplicaBookkeeping { source }) = inst.replica_bk { + if let Some(ReplicaBookkeeping { source, .. }) = inst.replica_bk { self.transport_hub.send_msg( PeerMsg::PrepareReply { slot, @@ -719,7 +792,7 @@ impl CrosswordReplica { } else { // on follower replica, finishing the logging of an // AcceptData entry leads to sending back an Accept reply - if let Some(ReplicaBookkeeping { source }) = inst.replica_bk { + if let Some(ReplicaBookkeeping { source, .. 
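// ---------------------------------------------------------------------------
// Aside: a runnable sketch (not part of the patch) of the rotated round-robin
// assignment that shards_for_replica() above computes now that `slot` is an
// input: replica `id` covers shards starting at (id + slot) % population,
// wrapping around the shard ring, so shard ownership rotates from slot to slot.
fn assignment_example() {
    // assumed population = 5, shards_per_replica = 2 (values picked for illustration)
    let shards = |slot: usize, id: u8| -> Vec<u8> {
        let population = 5u8;
        let first = ((id as usize + slot) % population as usize) as u8;
        (first..(first + 2)).map(|i| i % population).collect()
    };
    assert_eq!(shards(3, 0), vec![3, 4]);
    assert_eq!(shards(3, 2), vec![0, 1]);
    assert_eq!(shards(1, 3), vec![4, 0]); // wraps around past shard index 4
}
// ---------------------------------------------------------------------------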
}) = inst.replica_bk { self.transport_hub.send_msg( PeerMsg::AcceptReply { slot, @@ -854,7 +927,10 @@ impl CrosswordReplica { inst.bal = ballot; inst.status = Status::Preparing; - inst.replica_bk = Some(ReplicaBookkeeping { source: peer }); + inst.replica_bk = Some(ReplicaBookkeeping { + source: peer, + gossip_tried: false, + }); // update largest ballot seen self.bal_max_seen = ballot; @@ -1037,7 +1113,10 @@ impl CrosswordReplica { inst.bal = ballot; inst.status = Status::Accepting; inst.reqs_cw = reqs_cw; - inst.replica_bk = Some(ReplicaBookkeeping { source: peer }); + inst.replica_bk = Some(ReplicaBookkeeping { + source: peer, + gossip_tried: false, + }); // update largest ballot seen self.bal_max_seen = ballot; @@ -1468,26 +1547,13 @@ impl CrosswordReplica { } // do reconstruction reads for all committed instances that do not - // hold enough available shards for reconstruction + // hold enough available shards for reconstruction. It would be too + // complicated and slow to do the "data shards only" optimization + // during fail-over, so just do this conservatively here if inst.status == Status::Committed && inst.reqs_cw.avail_shards() < self.quorum_cnt { - recon_slots.insert( - slot, - inst.reqs_cw - .avail_shards_map() - .iter() - .filter_map( - |(idx, flag)| { - if flag { - Some(idx) - } else { - None - } - }, - ) - .collect(), - ); + recon_slots.insert(slot, inst.reqs_cw.avail_shards_vec()); // send reconstruction read messages in chunks if recon_slots.len() == self.config.recon_chunk_size { @@ -1593,11 +1659,10 @@ impl CrosswordReplica { /// Triggers gossiping for my missing shards in committed but not-yet- /// executed instances: fetch missing shards from peers, preferring /// follower peers that hold data shards. - // TODO: prefer replicas with original data shards first fn trigger_gossiping(&mut self) -> Result<(), SummersetError> { // maintain a map from peer ID to send to -> slots_excl to send let mut recon_slots: HashMap>> = - HashMap::new(); + HashMap::with_capacity(self.population as usize - 1); for peer in 0..self.population { if peer != self.id { recon_slots.insert(peer, HashMap::new()); @@ -1607,42 +1672,32 @@ impl CrosswordReplica { let mut slot_up_to = self.exec_bar; for slot in self.exec_bar..(self.start_slot + self.insts.len()) { slot_up_to = slot; - let inst = &self.insts[slot - self.start_slot]; - if inst.status >= Status::Executed { - continue; - } else if inst.status < Status::Committed { - break; + { + let inst = &self.insts[slot - self.start_slot]; + if inst.status >= Status::Executed { + continue; + } else if inst.status < Status::Committed { + break; + } } - if inst.reqs_cw.avail_shards() < self.quorum_cnt { - for peer in 0..self.population { - if peer == self.id { - continue; - } - if let Some(ReplicaBookkeeping { source }) = inst.replica_bk - { - if peer == source { - // skip leader who initially replicated this instance to me - continue; - } - } + let avail_shards_map = self.insts[slot - self.start_slot] + .reqs_cw + .avail_shards_map(); + if avail_shards_map.count() < self.quorum_cnt { + // decide which peers to ask for which shards from + let targets_excl = Self::gossip_targets_excl( + slot, + self.id, + self.population, + self.quorum_cnt, + self.config.shards_per_replica, + avail_shards_map, + &mut self.insts[slot - self.start_slot].replica_bk, + ); - recon_slots.get_mut(&peer).unwrap().insert( - slot, - inst.reqs_cw - .avail_shards_map() - .iter() - .filter_map( - |(idx, flag)| { - if flag { - Some(idx) - } else { - None - } - }, - ) - .collect(), - 
);
+                for (peer, exclude) in targets_excl {
+                    recon_slots.get_mut(&peer).unwrap().insert(slot, exclude);
 
                     // send reconstruction read messages in chunks
                     if recon_slots[&peer].len() == self.config.recon_chunk_size
diff --git a/src/utils/bitmap.rs b/src/utils/bitmap.rs
index 592ddb0c..5211e9f9 100644
--- a/src/utils/bitmap.rs
+++ b/src/utils/bitmap.rs
@@ -82,6 +82,15 @@ impl Bitmap {
     pub fn iter(&self) -> BitmapIter {
         BitmapIter { map: self, idx: 0 }
     }
+
+    /// Convenience method for converting the bitmap to a vec of indexes where
+    /// the flag is true.
+    #[inline]
+    pub fn to_vec(&self) -> Vec<u8> {
+        self.iter()
+            .filter_map(|(idx, flag)| if flag { Some(idx) } else { None })
+            .collect()
+    }
 }
 
 /// Iterator over `Bitmap`, yielding `(id, bit)` pairs.
@@ -175,5 +184,6 @@ mod bitmap_tests {
         for (id, flag) in map.iter() {
             assert_eq!(ref_map[id as usize], flag);
         }
+        assert_eq!(map.to_vec(), [0, 1, 3, 4]);
     }
 }
diff --git a/src/utils/rscoding.rs b/src/utils/rscoding.rs
index c8461c26..35fdc97c 100644
--- a/src/utils/rscoding.rs
+++ b/src/utils/rscoding.rs
@@ -305,15 +305,20 @@ where
         self.shards.iter().filter(|s| s.is_some()).count() as u8
     }
 
-    /// Gets a bitmap of available shard indexes set true.
+    /// Gets a vec of available shard indexes.
     #[inline]
-    pub fn avail_shards_map(&self) -> Bitmap {
-        let ones: Vec<u8> = self
-            .shards
+    pub fn avail_shards_vec(&self) -> Vec<u8> {
+        self.shards
             .iter()
             .enumerate()
             .filter_map(|(i, s)| if s.is_some() { Some(i as u8) } else { None })
-            .collect();
+            .collect()
+    }
+
+    /// Gets a bitmap of available shard indexes set true.
+    #[inline]
+    pub fn avail_shards_map(&self) -> Bitmap {
+        let ones = self.avail_shards_vec();
         Bitmap::from(self.num_shards(), ones)
     }
 
From ca6f52e2766c86b9fe75cdaaa7912d228170ab1f Mon Sep 17 00:00:00 2001
From: Guanzhou Hu
Date: Thu, 5 Oct 2023 23:33:50 -0500
Subject: [PATCH 78/89] staging progress on peer health tracking

---
 src/manager/clusman.rs               |  55 +++++----
 src/protocols/multipaxos.rs          | 137 ++++++++++++++++++----
 src/utils/error.rs                   |   2 +
 summerset_client/src/clients/repl.rs | 167 +++++++++++++++++++++------
 4 files changed, 277 insertions(+), 84 deletions(-)

diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs
index 89f2700d..e3bc0103 100644
--- a/src/manager/clusman.rs
+++ b/src/manager/clusman.rs
@@ -408,9 +408,13 @@ impl ClusterManager {
             self.server_info.get_mut(&s).unwrap().is_paused = true;
 
             // wait for dummy reply
-            let (_, reply) = self.server_reigner.recv_ctrl().await?;
-            if reply != CtrlMsg::PauseReply {
-                return logged_err!("m"; "unexpected reply type received");
+            loop {
+                let (server, reply) = self.server_reigner.recv_ctrl().await?;
+                if server != s || reply != CtrlMsg::PauseReply {
+                    self.handle_ctrl_msg(server, reply).await?;
+                } else {
+                    break;
+                }
             }
 
             pause_done.insert(s);
@@ -444,9 +448,13 @@ impl ClusterManager {
             self.server_reigner.send_ctrl(CtrlMsg::Resume, s)?;
 
             // wait for dummy reply
-            let (_, reply) = self.server_reigner.recv_ctrl().await?;
-            if reply != CtrlMsg::ResumeReply {
-                return logged_err!("m"; "unexpected reply type received");
+            loop {
+                let (server, reply) = self.server_reigner.recv_ctrl().await?;
+                if server != s || reply != CtrlMsg::ResumeReply {
+                    self.handle_ctrl_msg(server, reply).await?;
+                } else {
+                    break;
+                }
             }
 
             // clear the is_paused flag
@@ -484,22 +492,27 @@ impl ClusterManager {
             self.server_reigner.send_ctrl(CtrlMsg::TakeSnapshot, s)?;
 
             // wait for reply
-            let (_, reply) = self.server_reigner.recv_ctrl().await?;
-            if let CtrlMsg::SnapshotUpTo { new_start } = reply {
-                // update the log start
index info
-                assert!(self.server_info.contains_key(&s));
-                if new_start < self.server_info[&s].start_slot {
-                    return logged_err!("m"; "server {} snapshot up to {} < {}",
-                                            s, new_start,
-                                            self.server_info[&s].start_slot);
-                } else {
-                    self.server_info.get_mut(&s).unwrap().start_slot =
-                        new_start;
-                }
+            loop {
+                let (server, reply) = self.server_reigner.recv_ctrl().await?;
+                match reply {
+                    CtrlMsg::SnapshotUpTo { new_start } if server == s => {
+                        // update the log start index info
+                        assert!(self.server_info.contains_key(&s));
+                        if new_start < self.server_info[&s].start_slot {
+                            return logged_err!("m"; "server {} snapshot up to {} < {}",
+                                                    s, new_start,
+                                                    self.server_info[&s].start_slot);
+                        } else {
+                            self.server_info.get_mut(&s).unwrap().start_slot =
+                                new_start;
+                        }
+
+                        snapshot_up_to.insert(s, new_start);
+                        break;
+                    }
 
-                snapshot_up_to.insert(s, new_start);
-            } else {
-                return logged_err!("m"; "unexpected reply type received");
+                    _ => self.handle_ctrl_msg(server, reply).await?,
+                }
             }
         }
 
diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs
index fbcfc682..f1899544 100644
--- a/src/protocols/multipaxos.rs
+++ b/src/protocols/multipaxos.rs
@@ -259,14 +259,21 @@ pub struct MultiPaxosReplica {
     /// TransportHub module.
     transport_hub: TransportHub<PeerMsg>,
 
+    /// Who do I think is the effective leader of the cluster right now?
+    leader: Option<ReplicaId>,
+
     /// Timer for hearing heartbeat from leader.
     hb_hear_timer: Timer,
 
     /// Interval for sending heartbeat to followers.
     hb_send_interval: Interval,
 
-    /// Do I think I am the leader?
-    is_leader: bool,
+    /// Heartbeat reply counters for approximate detection of follower health.
+    /// Tuple of (#hb_replied, #hb_replied seen at last send, repetition).
+    hb_reply_cnts: HashMap<ReplicaId, (u64, u64, u8)>,
+
+    /// Approximate health status tracking of peer replicas.
+    peer_alive: Bitmap,
 
     /// In-memory log of instances.
     insts: Vec<Instance>,
@@ -302,6 +309,12 @@ pub struct MultiPaxosReplica {
 
 // MultiPaxosReplica common helpers
 impl MultiPaxosReplica {
+    /// Do I think I am the current effective leader?
+    #[inline]
+    fn is_leader(&self) -> bool {
+        self.leader == Some(self.id)
+    }
+
     /// Create an empty null instance.
     #[inline]
     fn null_instance(&self) -> Instance {
@@ -396,21 +409,26 @@ impl MultiPaxosReplica {
         pf_debug!(self.id; "got request batch of size {}", batch_size);
 
         // if I'm not a leader, ignore client requests
-        if !self.is_leader {
+        if !self.is_leader() {
             for (client, req) in req_batch {
                 if let ApiRequest::Req {
                     id: req_id, ..
} = req { - // tell the client to try on the next replica - let next_replica = (self.id + 1) % self.population; + // tell the client to try on known leader or just the + // next ID replica + let target = if let Some(peer) = self.leader { + peer + } else { + (self.id + 1) % self.population + }; self.external_api.send_reply( ApiReply::Reply { id: req_id, result: None, - redirect: Some(next_replica), + redirect: Some(target), }, client, )?; pf_trace!(self.id; "redirected client {} to replica {}", - client, next_replica); + client, target); } } return Ok(()); @@ -532,7 +550,7 @@ impl MultiPaxosReplica { None }; - if self.is_leader { + if self.is_leader() { // on leader, finishing the logging of a PrepareBal entry // is equivalent to receiving a Prepare reply from myself // (as an acceptor role) @@ -569,7 +587,7 @@ impl MultiPaxosReplica { slot, self.insts[slot - self.start_slot].bal); let inst = &self.insts[slot - self.start_slot]; - if self.is_leader { + if self.is_leader() { // on leader, finishing the logging of an AcceptData entry // is equivalent to receiving an Accept reply from myself // (as an acceptor role) @@ -738,10 +756,11 @@ impl MultiPaxosReplica { // if ballot is what I'm currently waiting on for Prepare replies: if ballot == self.bal_prep_sent { assert!(slot < self.start_slot + self.insts.len()); + let is_leader = self.is_leader(); let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if !self.is_leader + if !is_leader || (inst.status != Status::Preparing) || (ballot < inst.bal) { @@ -871,10 +890,11 @@ impl MultiPaxosReplica { // if ballot is what I'm currently waiting on for Accept replies: if ballot == self.bal_prepared { assert!(slot < self.start_slot + self.insts.len()); + let is_leader = self.is_leader(); let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if !self.is_leader + if !is_leader || (inst.status != Status::Accepting) || (ballot < inst.bal) { @@ -1055,16 +1075,25 @@ impl MultiPaxosReplica { /// Becomes a leader, sends self-initiated Prepare messages to followers /// for all in-progress instances, and starts broadcasting heartbeats. fn become_a_leader(&mut self) -> Result<(), SummersetError> { - if self.is_leader { + if self.is_leader() { return Ok(()); + } else if let Some(peer) = self.leader { + // mark old leader as dead + if self.peer_alive.get(peer)? 
{ + self.peer_alive.set(peer, false)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } } - self.is_leader = true; // this starts broadcasting heartbeats + self.leader = Some(self.id); // this starts broadcasting heartbeats self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; pf_info!(self.id; "becoming a leader..."); - // broadcast a heartbeat right now + // clear peers' heartbeat reply counters, and broadcast a heartbeat now + for cnts in self.hb_reply_cnts.values_mut() { + *cnts = (1, 0, 0); + } self.bcast_heartbeats()?; // make a greater ballot number and invalidate all in-progress instances @@ -1129,6 +1158,33 @@ impl MultiPaxosReplica { }, None, )?; + + // update max heartbeat reply counters and their repetitions seen + for (&peer, cnts) in self.hb_reply_cnts.iter_mut() { + if cnts.0 > cnts.1 { + // more hb replies have been received from this peer; it is + // probably alive + cnts.1 = cnts.0; + cnts.2 = 0; + } else { + // did not receive hb reply from this peer at least for the + // last sent hb from me; increment repetition count + cnts.2 += 1; + let repeat_threshold = (self.config.hb_hear_timeout_min + / self.config.hb_send_interval_ms) + as u8; + if cnts.2 > repeat_threshold { + // did not receive hb reply from this peer for too many + // past hbs sent from me; this peer is probably dead + if self.peer_alive.get(peer)? { + self.peer_alive.set(peer, false)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + } + } + + // I also heard this heartbeat from myself self.heard_heartbeat(self.id, self.bal_prep_sent, self.exec_bar)?; // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent); @@ -1143,6 +1199,7 @@ impl MultiPaxosReplica { ); // pf_trace!(self.id; "kickoff hb_hear_timer @ {} ms", timeout_ms); + self.hb_hear_timer.cancel()?; self.hb_hear_timer .kickoff(Duration::from_millis(timeout_ms))?; Ok(()) @@ -1153,10 +1210,18 @@ impl MultiPaxosReplica { /// leader status if I currently think I'm a leader. fn heard_heartbeat( &mut self, - _peer: ReplicaId, + peer: ReplicaId, ballot: Ballot, exec_bar: usize, ) -> Result<(), SummersetError> { + if peer != self.id { + self.hb_reply_cnts.get_mut(&peer).unwrap().0 += 1; + if !self.peer_alive.get(peer)? 
{ + self.peer_alive.set(peer, true)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + // ignore outdated heartbeats and those from peers with exec_bar < mine if ballot < self.bal_max_seen || exec_bar < self.exec_bar { return Ok(()); @@ -1165,12 +1230,28 @@ impl MultiPaxosReplica { // reset hearing timer self.kickoff_hb_hear_timer()?; - // clear my leader status if it carries a higher ballot number - if self.is_leader && ballot > self.bal_max_seen { - self.is_leader = false; - self.control_hub - .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; - pf_info!(self.id; "no longer a leader..."); + if peer != self.id { + // reply back with a Heartbeat message + self.transport_hub.send_msg( + PeerMsg::Heartbeat { + ballot: self.bal_max_seen, + exec_bar: self.exec_bar, + }, + peer, + )?; + + // if the peer has made a higher ballot number + if ballot > self.bal_max_seen { + // clear my leader status if I was one + if self.is_leader() && ballot > self.bal_max_seen { + self.control_hub + .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; + pf_info!(self.id; "no longer a leader..."); + } + + // set this peer to be the believed leader + self.leader = Some(peer); + } } // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); @@ -1537,7 +1618,7 @@ impl MultiPaxosReplica { } // collect and dump all Puts in executed instances - if self.is_leader { + if self.is_leader() { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } @@ -1570,7 +1651,7 @@ impl MultiPaxosReplica { self.start_slot = self.exec_bar; // discarding everything older than start_slot in WAL log - if self.is_leader { + if self.is_leader() { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } @@ -1824,6 +1905,10 @@ impl GenericReplica for MultiPaxosReplica { )); snapshot_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + let hb_reply_cnts = (0..population) + .filter_map(|p| if p == id { None } else { Some((p, (1, 0, 0))) }) + .collect(); + Ok(MultiPaxosReplica { id, population, @@ -1837,9 +1922,11 @@ impl GenericReplica for MultiPaxosReplica { storage_hub, snapshot_hub, transport_hub, + leader: None, hb_hear_timer: Timer::new(), hb_send_interval, - is_leader: false, + hb_reply_cnts, + peer_alive: Bitmap::new(population, true), insts: vec![], start_slot: 0, snapshot_interval, @@ -1927,7 +2014,7 @@ impl GenericReplica for MultiPaxosReplica { }, // leader sending heartbeat - _ = self.hb_send_interval.tick(), if !paused && self.is_leader => { + _ = self.hb_send_interval.tick(), if !paused && self.is_leader() => { if let Err(e) = self.bcast_heartbeats() { pf_error!(self.id; "error broadcasting heartbeats: {}", e); } diff --git a/src/utils/error.rs b/src/utils/error.rs index 0e73dccb..6c0907a0 100644 --- a/src/utils/error.rs +++ b/src/utils/error.rs @@ -3,6 +3,7 @@ use std::fmt; use std::io; use std::net; +use std::num; use crate::server::ReplicaId; @@ -30,6 +31,7 @@ macro_rules! impl_from_error { } impl_from_error!(io::Error); +impl_from_error!(num::ParseIntError); impl_from_error!(net::AddrParseError); impl_from_error!(rmp_serde::encode::Error); impl_from_error!(rmp_serde::decode::Error); diff --git a/summerset_client/src/clients/repl.rs b/summerset_client/src/clients/repl.rs index 09e4f330..88e0cfbb 100644 --- a/summerset_client/src/clients/repl.rs +++ b/summerset_client/src/clients/repl.rs @@ -1,6 +1,8 @@ //! Interactive REPL-style command-line interface client. 
+use std::collections::HashSet; use std::io::{self, Write}; +use std::str::SplitWhitespace; use crate::drivers::{DriverReply, DriverClosedLoop}; @@ -8,7 +10,9 @@ use color_print::{cprint, cprintln}; use tokio::time::Duration; -use summerset::{GenericEndpoint, Command, SummersetError}; +use summerset::{ + ReplicaId, GenericEndpoint, Command, CtrlRequest, CtrlReply, SummersetError, +}; /// Prompt string at the start of line. const PROMPT: &str = ">>>>> "; @@ -24,6 +28,9 @@ enum ReplCommand { /// Print help message. PrintHelp, + /// Control request to the manager. + Control(CtrlRequest), + /// Client exit. Exit, @@ -54,28 +61,61 @@ impl ClientRepl { } /// Prints the prompt string. - fn print_prompt(&mut self) { + #[inline] + fn print_prompt() { cprint!("{}", PROMPT); io::stdout().flush().unwrap(); } /// Prints (optionally) an error message and the help message. - fn print_help(&mut self, err: Option<&SummersetError>) { + fn print_help(err: Option<&SummersetError>) { if let Some(e) = err { cprintln!("✗ {}", e); } - println!("HELP: Supported commands are:"); - println!(" get "); - println!(" put "); - println!(" reconnect"); - println!(" help"); - println!(" exit"); + println!("HELP: Supported normal commands are:"); + println!(" get "); + println!(" put "); + println!(" help"); + println!(" exit"); + println!(" Commands for control/testing:"); + println!(" reconnect"); + println!(" reset [servers]"); + println!(" pause [servers]"); + println!(" resume [servers]"); + println!(" snapshot [servers]"); println!( " Keys and values currently cannot contain any whitespaces" ); io::stdout().flush().unwrap(); } + /// Expect to get the next segment string from parsed segs. + #[inline] + fn expect_next_seg<'s>( + segs: &mut SplitWhitespace<'s>, + ) -> Result<&'s str, SummersetError> { + if let Some(seg) = segs.next() { + Ok(seg) + } else { + let err = SummersetError("not enough args".into()); + Self::print_help(Some(&err)); + Err(err) + } + } + + /// Drain all of the remaining segments into a hash set and interpret as + /// replica IDs. + #[inline] + fn drain_server_ids( + segs: &mut SplitWhitespace, + ) -> Result, SummersetError> { + let mut servers = HashSet::new(); + for seg in segs { + servers.insert(seg.parse::()?); + } + Ok(servers) + } + /// Reads in user input and parses into a command. fn read_command(&mut self) -> Result { self.input_buf.clear(); @@ -98,36 +138,18 @@ impl ClientRepl { match &cmd_type.unwrap().to_lowercase()[..] 
{ "get" => { - let key = segs.next(); - if key.is_none() { - let err = SummersetError("not enough args".into()); - self.print_help(Some(&err)); - return Err(err); - } - - // keys and values are kept as-is, no case conversions - Ok(ReplCommand::Normal(Command::Get { - key: key.unwrap().into(), - })) + // keys are kept as-is, no case conversions + let key = Self::expect_next_seg(&mut segs)?; + Ok(ReplCommand::Normal(Command::Get { key: key.into() })) } "put" => { - let key = segs.next(); - if key.is_none() { - let err = SummersetError("not enough args".into()); - self.print_help(Some(&err)); - return Err(err); - } - let value = segs.next(); - if value.is_none() { - let err = SummersetError("not enough args".into()); - self.print_help(Some(&err)); - return Err(err); - } - + // keys and values are kept as-is, no case conversions + let key = Self::expect_next_seg(&mut segs)?; + let value = Self::expect_next_seg(&mut segs)?; Ok(ReplCommand::Normal(Command::Put { - key: key.unwrap().into(), - value: value.unwrap().into(), + key: key.into(), + value: value.into(), })) } @@ -135,6 +157,29 @@ impl ClientRepl { "reconnect" => Ok(ReplCommand::Reconnect), + "reset" => { + let servers = Self::drain_server_ids(&mut segs)?; + Ok(ReplCommand::Control(CtrlRequest::ResetServers { + servers, + durable: true, + })) + } + + "pause" => { + let servers = Self::drain_server_ids(&mut segs)?; + Ok(ReplCommand::Control(CtrlRequest::PauseServers { servers })) + } + + "resume" => { + let servers = Self::drain_server_ids(&mut segs)?; + Ok(ReplCommand::Control(CtrlRequest::ResumeServers { servers })) + } + + "snapshot" => { + let servers = Self::drain_server_ids(&mut segs)?; + Ok(ReplCommand::Control(CtrlRequest::TakeSnapshot { servers })) + } + "exit" => Ok(ReplCommand::Exit), _ => { @@ -142,7 +187,7 @@ impl ClientRepl { "unrecognized command: {}", cmd_type.unwrap() )); - self.print_help(Some(&err)); + Self::print_help(Some(&err)); Err(err) } } @@ -200,9 +245,49 @@ impl ClientRepl { io::stdout().flush().unwrap(); } + /// Makes a control request to the manager and wait for the reply. + async fn make_ctrl_req( + &mut self, + req: CtrlRequest, + ) -> Result { + let mut sent = self.driver.ctrl_stub().send_req(Some(&req))?; + while !sent { + sent = self.driver.ctrl_stub().send_req(None)?; + } + self.driver.ctrl_stub().recv_reply().await + } + + /// Prints control request reply. + fn print_ctrl_reply(&mut self, reply: CtrlReply) { + match reply { + CtrlReply::ResetServers { servers } => { + cprintln!("# reset servers {:?}", servers); + } + + CtrlReply::PauseServers { servers } => { + cprintln!("# paused servers {:?}", servers); + } + + CtrlReply::ResumeServers { servers } => { + cprintln!("# resumed servers {:?}", servers); + } + + CtrlReply::TakeSnapshot { snapshot_up_to } => { + cprintln!( + "# servers snapshot up to {:?}", + snapshot_up_to + ); + } + + _ => { + cprintln!("✗ unexpected ctrl reply type"); + } + } + } + /// One iteration of the REPL loop. 
async fn iter(&mut self) -> Result { - self.print_prompt(); + Self::print_prompt(); let cmd = self.read_command()?; match cmd { @@ -221,7 +306,7 @@ impl ClientRepl { } ReplCommand::PrintHelp => { - self.print_help(None); + Self::print_help(None); Ok(true) } @@ -230,6 +315,12 @@ impl ClientRepl { self.print_result(result); Ok(true) } + + ReplCommand::Control(req) => { + let reply = self.make_ctrl_req(req).await?; + self.print_ctrl_reply(reply); + Ok(true) + } } } From 5b1c107aa51db3f084c2881cccd050acda8013f9 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Fri, 6 Oct 2023 00:06:01 -0500 Subject: [PATCH 79/89] finish peer health tracking --- src/protocols/crossword.rs | 141 +++++++++++++++++++++++++++++------- src/protocols/multipaxos.rs | 8 +- src/protocols/rs_paxos.rs | 139 ++++++++++++++++++++++++++++------- 3 files changed, 234 insertions(+), 54 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index c86549c7..37e82ba8 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -294,14 +294,21 @@ pub struct CrosswordReplica { /// TransportHub module. transport_hub: TransportHub, + /// Who do I think is the effective leader of the cluster right now? + leader: Option, + /// Timer for hearing heartbeat from leader. hb_hear_timer: Timer, /// Interval for sending heartbeat to followers. hb_send_interval: Interval, - /// Do I think I am the leader? - is_leader: bool, + /// Heartbeat reply counters for approximate detection of follower health. + /// Tuple of (#hb_replied, #hb_replied seen at last send, repetition). + hb_reply_cnts: HashMap, + + /// Approximate health status tracking of peer replicas. + peer_alive: Bitmap, /// In-memory log of instances. insts: Vec, @@ -343,6 +350,12 @@ pub struct CrosswordReplica { // CrosswordReplica common helpers impl CrosswordReplica { + /// Do I think I am the current effective leader? + #[inline] + fn is_leader(&self) -> bool { + self.leader == Some(self.id) + } + /// Create an empty null instance. #[inline] fn null_instance(&self) -> Result { @@ -573,21 +586,26 @@ impl CrosswordReplica { pf_debug!(self.id; "got request batch of size {}", batch_size); // if I'm not a leader, ignore client requests - if !self.is_leader { + if !self.is_leader() { for (client, req) in req_batch { if let ApiRequest::Req { id: req_id, .. 
} = req { - // tell the client to try on the next replica - let next_replica = (self.id + 1) % self.population; + // tell the client to try on known leader or just the + // next ID replica + let target = if let Some(peer) = self.leader { + peer + } else { + (self.id + 1) % self.population + }; self.external_api.send_reply( ApiReply::Reply { id: req_id, result: None, - redirect: Some(next_replica), + redirect: Some(target), }, client, )?; pf_trace!(self.id; "redirected client {} to replica {}", - client, next_replica); + client, target); } } return Ok(()); @@ -747,7 +765,7 @@ impl CrosswordReplica { None }; - if self.is_leader { + if self.is_leader() { // on leader, finishing the logging of a PrepareBal entry // is equivalent to receiving a Prepare reply from myself // (as an acceptor role) @@ -784,7 +802,7 @@ impl CrosswordReplica { slot, self.insts[slot - self.start_slot].bal); let inst = &self.insts[slot - self.start_slot]; - if self.is_leader { + if self.is_leader() { // on leader, finishing the logging of an AcceptData entry // is equivalent to receiving an Accept reply from myself // (as an acceptor role) @@ -968,10 +986,11 @@ impl CrosswordReplica { // if ballot is what I'm currently waiting on for Prepare replies: if ballot == self.bal_prep_sent { assert!(slot < self.start_slot + self.insts.len()); + let is_leader = self.is_leader(); let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if !self.is_leader + if !is_leader || (inst.status != Status::Preparing) || (ballot < inst.bal) { @@ -1157,10 +1176,11 @@ impl CrosswordReplica { // if ballot is what I'm currently waiting on for Accept replies: if ballot == self.bal_prepared { assert!(slot < self.start_slot + self.insts.len()); + let is_leader = self.is_leader(); let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if !self.is_leader + if !is_leader || (inst.status != Status::Accepting) || (ballot < inst.bal) { @@ -1484,16 +1504,25 @@ impl CrosswordReplica { /// Becomes a leader, sends self-initiated Prepare messages to followers /// for all in-progress instances, and starts broadcasting heartbeats. fn become_a_leader(&mut self) -> Result<(), SummersetError> { - if self.is_leader { + if self.is_leader() { return Ok(()); + } else if let Some(peer) = self.leader { + // mark old leader as dead + if self.peer_alive.get(peer)? 
{ + self.peer_alive.set(peer, false)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } } - self.is_leader = true; // this starts broadcasting heartbeats + self.leader = Some(self.id); // this starts broadcasting heartbeats self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; pf_info!(self.id; "becoming a leader..."); - // broadcast a heartbeat right now + // clear peers' heartbeat reply counters, and broadcast a heartbeat now + for cnts in self.hb_reply_cnts.values_mut() { + *cnts = (1, 0, 0); + } self.bcast_heartbeats()?; // make a greater ballot number and invalidate all in-progress instances @@ -1592,6 +1621,33 @@ impl CrosswordReplica { }, None, )?; + + // update max heartbeat reply counters and their repetitions seen + for (&peer, cnts) in self.hb_reply_cnts.iter_mut() { + if cnts.0 > cnts.1 { + // more hb replies have been received from this peer; it is + // probably alive + cnts.1 = cnts.0; + cnts.2 = 0; + } else { + // did not receive hb reply from this peer at least for the + // last sent hb from me; increment repetition count + cnts.2 += 1; + let repeat_threshold = (self.config.hb_hear_timeout_min + / self.config.hb_send_interval_ms) + as u8; + if cnts.2 > repeat_threshold { + // did not receive hb reply from this peer for too many + // past hbs sent from me; this peer is probably dead + if self.peer_alive.get(peer)? { + self.peer_alive.set(peer, false)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + } + } + + // I also heard this heartbeat from myself self.heard_heartbeat(self.id, self.bal_prep_sent, self.exec_bar)?; // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent); @@ -1616,10 +1672,18 @@ impl CrosswordReplica { /// leader status if I currently think I'm a leader. fn heard_heartbeat( &mut self, - _peer: ReplicaId, + peer: ReplicaId, ballot: Ballot, exec_bar: usize, ) -> Result<(), SummersetError> { + if peer != self.id { + self.hb_reply_cnts.get_mut(&peer).unwrap().0 += 1; + if !self.peer_alive.get(peer)? 
{ + self.peer_alive.set(peer, true)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + // ignore outdated heartbeats and those from peers with exec_bar < mine if ballot < self.bal_max_seen || exec_bar < self.exec_bar { return Ok(()); @@ -1628,12 +1692,30 @@ impl CrosswordReplica { // reset hearing timer self.kickoff_hb_hear_timer()?; - // clear my leader status if it carries a higher ballot number - if self.is_leader && ballot > self.bal_max_seen { - self.is_leader = false; - self.control_hub - .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; - pf_info!(self.id; "no longer a leader..."); + if peer != self.id { + // reply back with a Heartbeat message + self.transport_hub.send_msg( + PeerMsg::Heartbeat { + ballot, + exec_bar: self.exec_bar, + }, + peer, + )?; + + // if the peer has made a higher ballot number + if ballot > self.bal_max_seen { + self.bal_max_seen = ballot; + + // clear my leader status if I was one + if self.is_leader() { + self.control_hub + .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; + pf_info!(self.id; "no longer a leader..."); + } + + // set this peer to be the believed leader + self.leader = Some(peer); + } } // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); @@ -1799,6 +1881,7 @@ impl CrosswordReplica { pf_warn!(self.id; "server got resume req"); // reset leader heartbeat timer + self.hb_hear_timer.cancel()?; self.kickoff_hb_hear_timer()?; *paused = false; @@ -2114,7 +2197,7 @@ impl CrosswordReplica { } // collect and dump all Puts in executed instances - if self.is_leader { + if self.is_leader() { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } @@ -2147,7 +2230,7 @@ impl CrosswordReplica { self.start_slot = self.exec_bar; // discarding everything older than start_slot in WAL log - if self.is_leader { + if self.is_leader() { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } @@ -2429,6 +2512,10 @@ impl GenericReplica for CrosswordReplica { )); snapshot_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + let hb_reply_cnts = (0..population) + .filter_map(|p| if p == id { None } else { Some((p, (1, 0, 0))) }) + .collect(); + Ok(CrosswordReplica { id, population, @@ -2442,9 +2529,11 @@ impl GenericReplica for CrosswordReplica { storage_hub, snapshot_hub, transport_hub, + leader: None, hb_hear_timer: Timer::new(), hb_send_interval, - is_leader: false, + hb_reply_cnts, + peer_alive: Bitmap::new(population, true), insts: vec![], start_slot: 0, snapshot_interval, @@ -2537,7 +2626,7 @@ impl GenericReplica for CrosswordReplica { }, // leader sending heartbeat - _ = self.hb_send_interval.tick(), if !paused && self.is_leader => { + _ = self.hb_send_interval.tick(), if !paused && self.is_leader() => { if let Err(e) = self.bcast_heartbeats() { pf_error!(self.id; "error broadcasting heartbeats: {}", e); } @@ -2556,7 +2645,7 @@ impl GenericReplica for CrosswordReplica { }, // follower gossiping trigger - _ = self.gossip_timer.timeout(), if !paused && !self.is_leader => { + _ = self.gossip_timer.timeout(), if !paused && !self.is_leader() => { if let Err(e) = self.trigger_gossiping() { pf_error!(self.id; "error triggering gossiping: {}", e); } diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index f1899544..7f1fb73b 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -1199,7 +1199,6 @@ impl MultiPaxosReplica { ); // pf_trace!(self.id; "kickoff hb_hear_timer @ {} ms", timeout_ms); - 
self.hb_hear_timer.cancel()?; self.hb_hear_timer .kickoff(Duration::from_millis(timeout_ms))?; Ok(()) @@ -1234,7 +1233,7 @@ impl MultiPaxosReplica { // reply back with a Heartbeat message self.transport_hub.send_msg( PeerMsg::Heartbeat { - ballot: self.bal_max_seen, + ballot, exec_bar: self.exec_bar, }, peer, @@ -1242,8 +1241,10 @@ impl MultiPaxosReplica { // if the peer has made a higher ballot number if ballot > self.bal_max_seen { + self.bal_max_seen = ballot; + // clear my leader status if I was one - if self.is_leader() && ballot > self.bal_max_seen { + if self.is_leader() { self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; pf_info!(self.id; "no longer a leader..."); @@ -1320,6 +1321,7 @@ impl MultiPaxosReplica { pf_warn!(self.id; "server got resume req"); // reset leader heartbeat timer + self.hb_hear_timer.cancel()?; self.kickoff_hb_hear_timer()?; *paused = false; diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 19399535..d6e79486 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -274,14 +274,21 @@ pub struct RSPaxosReplica { /// TransportHub module. transport_hub: TransportHub, + /// Who do I think is the effective leader of the cluster right now? + leader: Option, + /// Timer for hearing heartbeat from leader. hb_hear_timer: Timer, /// Interval for sending heartbeat to followers. hb_send_interval: Interval, - /// Do I think I am the leader? - is_leader: bool, + /// Heartbeat reply counters for approximate detection of follower health. + /// Tuple of (#hb_replied, #hb_replied seen at last send, repetition). + hb_reply_cnts: HashMap, + + /// Approximate health status tracking of peer replicas. + peer_alive: Bitmap, /// In-memory log of instances. insts: Vec, @@ -320,6 +327,12 @@ pub struct RSPaxosReplica { // RSPaxosReplica common helpers impl RSPaxosReplica { + /// Do I think I am the current effective leader? + #[inline] + fn is_leader(&self) -> bool { + self.leader == Some(self.id) + } + /// Create an empty null instance. #[inline] fn null_instance(&self) -> Result { @@ -423,21 +436,26 @@ impl RSPaxosReplica { pf_debug!(self.id; "got request batch of size {}", batch_size); // if I'm not a leader, ignore client requests - if !self.is_leader { + if !self.is_leader() { for (client, req) in req_batch { if let ApiRequest::Req { id: req_id, .. 
} = req { - // tell the client to try on the next replica - let next_replica = (self.id + 1) % self.population; + // tell the client to try on known leader or just the + // next ID replica + let target = if let Some(peer) = self.leader { + peer + } else { + (self.id + 1) % self.population + }; self.external_api.send_reply( ApiReply::Reply { id: req_id, result: None, - redirect: Some(next_replica), + redirect: Some(target), }, client, )?; pf_trace!(self.id; "redirected client {} to replica {}", - client, next_replica); + client, target); } } return Ok(()); @@ -580,7 +598,7 @@ impl RSPaxosReplica { None }; - if self.is_leader { + if self.is_leader() { // on leader, finishing the logging of a PrepareBal entry // is equivalent to receiving a Prepare reply from myself // (as an acceptor role) @@ -617,7 +635,7 @@ impl RSPaxosReplica { slot, self.insts[slot - self.start_slot].bal); let inst = &self.insts[slot - self.start_slot]; - if self.is_leader { + if self.is_leader() { // on leader, finishing the logging of an AcceptData entry // is equivalent to receiving an Accept reply from myself // (as an acceptor role) @@ -798,10 +816,11 @@ impl RSPaxosReplica { // if ballot is what I'm currently waiting on for Prepare replies: if ballot == self.bal_prep_sent { assert!(slot < self.start_slot + self.insts.len()); + let is_leader = self.is_leader(); let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if !self.is_leader + if !is_leader || (inst.status != Status::Preparing) || (ballot < inst.bal) { @@ -968,10 +987,11 @@ impl RSPaxosReplica { // if ballot is what I'm currently waiting on for Accept replies: if ballot == self.bal_prepared { assert!(slot < self.start_slot + self.insts.len()); + let is_leader = self.is_leader(); let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if !self.is_leader + if !is_leader || (inst.status != Status::Accepting) || (ballot < inst.bal) { @@ -1276,16 +1296,25 @@ impl RSPaxosReplica { /// Becomes a leader, sends self-initiated Prepare messages to followers /// for all in-progress instances, and starts broadcasting heartbeats. fn become_a_leader(&mut self) -> Result<(), SummersetError> { - if self.is_leader { + if self.is_leader() { return Ok(()); + } else if let Some(peer) = self.leader { + // mark old leader as dead + if self.peer_alive.get(peer)? 
{ + self.peer_alive.set(peer, false)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } } - self.is_leader = true; // this starts broadcasting heartbeats + self.leader = Some(self.id); // this starts broadcasting heartbeats self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; pf_info!(self.id; "becoming a leader..."); - // broadcast a heartbeat right now + // clear peers' heartbeat reply counters, and broadcast a heartbeat now + for cnts in self.hb_reply_cnts.values_mut() { + *cnts = (1, 0, 0); + } self.bcast_heartbeats()?; // make a greater ballot number and invalidate all in-progress instances @@ -1367,6 +1396,33 @@ impl RSPaxosReplica { }, None, )?; + + // update max heartbeat reply counters and their repetitions seen + for (&peer, cnts) in self.hb_reply_cnts.iter_mut() { + if cnts.0 > cnts.1 { + // more hb replies have been received from this peer; it is + // probably alive + cnts.1 = cnts.0; + cnts.2 = 0; + } else { + // did not receive hb reply from this peer at least for the + // last sent hb from me; increment repetition count + cnts.2 += 1; + let repeat_threshold = (self.config.hb_hear_timeout_min + / self.config.hb_send_interval_ms) + as u8; + if cnts.2 > repeat_threshold { + // did not receive hb reply from this peer for too many + // past hbs sent from me; this peer is probably dead + if self.peer_alive.get(peer)? { + self.peer_alive.set(peer, false)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + } + } + + // I also heard this heartbeat from myself self.heard_heartbeat(self.id, self.bal_prep_sent, self.exec_bar)?; // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent); @@ -1391,10 +1447,18 @@ impl RSPaxosReplica { /// leader status if I currently think I'm a leader. fn heard_heartbeat( &mut self, - _peer: ReplicaId, + peer: ReplicaId, ballot: Ballot, exec_bar: usize, ) -> Result<(), SummersetError> { + if peer != self.id { + self.hb_reply_cnts.get_mut(&peer).unwrap().0 += 1; + if !self.peer_alive.get(peer)? 
{ + self.peer_alive.set(peer, true)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + // ignore outdated heartbeats and those from peers with exec_bar < mine if ballot < self.bal_max_seen || exec_bar < self.exec_bar { return Ok(()); @@ -1403,12 +1467,30 @@ impl RSPaxosReplica { // reset hearing timer self.kickoff_hb_hear_timer()?; - // clear my leader status if it carries a higher ballot number - if self.is_leader && ballot > self.bal_max_seen { - self.is_leader = false; - self.control_hub - .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; - pf_info!(self.id; "no longer a leader..."); + if peer != self.id { + // reply back with a Heartbeat message + self.transport_hub.send_msg( + PeerMsg::Heartbeat { + ballot, + exec_bar: self.exec_bar, + }, + peer, + )?; + + // if the peer has made a higher ballot number + if ballot > self.bal_max_seen { + self.bal_max_seen = ballot; + + // clear my leader status if I was one + if self.is_leader() { + self.control_hub + .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; + pf_info!(self.id; "no longer a leader..."); + } + + // set this peer to be the believed leader + self.leader = Some(peer); + } } // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); @@ -1477,6 +1559,7 @@ impl RSPaxosReplica { pf_warn!(self.id; "server got resume req"); // reset leader heartbeat timer + self.hb_hear_timer.cancel()?; self.kickoff_hb_hear_timer()?; *paused = false; @@ -1792,7 +1875,7 @@ impl RSPaxosReplica { } // collect and dump all Puts in executed instances - if self.is_leader { + if self.is_leader() { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } @@ -1825,7 +1908,7 @@ impl RSPaxosReplica { self.start_slot = self.exec_bar; // discarding everything older than start_slot in WAL log - if self.is_leader { + if self.is_leader() { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } @@ -2099,6 +2182,10 @@ impl GenericReplica for RSPaxosReplica { )); snapshot_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + let hb_reply_cnts = (0..population) + .filter_map(|p| if p == id { None } else { Some((p, (1, 0, 0))) }) + .collect(); + Ok(RSPaxosReplica { id, population, @@ -2112,9 +2199,11 @@ impl GenericReplica for RSPaxosReplica { storage_hub, snapshot_hub, transport_hub, + leader: None, hb_hear_timer: Timer::new(), hb_send_interval, - is_leader: false, + hb_reply_cnts, + peer_alive: Bitmap::new(population, true), insts: vec![], start_slot: 0, snapshot_interval, @@ -2203,7 +2292,7 @@ impl GenericReplica for RSPaxosReplica { }, // leader sending heartbeat - _ = self.hb_send_interval.tick(), if !paused && self.is_leader => { + _ = self.hb_send_interval.tick(), if !paused && self.is_leader() => { if let Err(e) = self.bcast_heartbeats() { pf_error!(self.id; "error broadcasting heartbeats: {}", e); } From 313dba1eab0e25b97fe9f39164d3dd09bcb61cdf Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Fri, 6 Oct 2023 00:24:29 -0500 Subject: [PATCH 80/89] finish peer health tracking --- src/protocols/crossword.rs | 18 +++++++++++++----- src/protocols/multipaxos.rs | 9 +++++++++ src/protocols/rs_paxos.rs | 9 +++++++++ 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 37e82ba8..554965cd 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -495,15 +495,14 @@ impl CrosswordReplica { let mut targets_excl = HashMap::new(); for p in (me + 1)..(population + me) { let peer = 
p % population; - if peer == src_peer { - // skip leader who initially replicated this instance to me - continue; - } - if !first_try { // first try probably did not succeed, so do it conservatively targets_excl.insert(peer, avail_shards_map.to_vec()); } else { + // skip leader who initially replicated this instance to me + if peer == src_peer { + continue; + } // first try: only ask for a minimum number of data shards let mut useful_shards = Vec::new(); for idx in Self::shards_for_replica( @@ -1945,6 +1944,9 @@ impl CrosswordReplica { ) -> Result<(), SummersetError> { match entry { LogEntry::PrepareBal { slot, ballot } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } // locate instance in memory, filling in null instances if needed while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); @@ -1968,6 +1970,9 @@ impl CrosswordReplica { ballot, reqs_cw, } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } // locate instance in memory, filling in null instances if needed while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); @@ -1994,6 +1999,9 @@ impl CrosswordReplica { } LogEntry::CommitSlot { slot } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } assert!(slot < self.start_slot + self.insts.len()); // update instance status self.insts[slot - self.start_slot].status = Status::Committed; diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 7f1fb73b..aa42d2d1 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -1385,6 +1385,9 @@ impl MultiPaxosReplica { ) -> Result<(), SummersetError> { match entry { LogEntry::PrepareBal { slot, ballot } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } // locate instance in memory, filling in null instances if needed while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()); @@ -1404,6 +1407,9 @@ impl MultiPaxosReplica { } LogEntry::AcceptData { slot, ballot, reqs } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } // locate instance in memory, filling in null instances if needed while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()); @@ -1430,6 +1436,9 @@ impl MultiPaxosReplica { } LogEntry::CommitSlot { slot } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } assert!(slot < self.start_slot + self.insts.len()); // update instance status self.insts[slot - self.start_slot].status = Status::Committed; diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index d6e79486..79c7a09a 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -1623,6 +1623,9 @@ impl RSPaxosReplica { ) -> Result<(), SummersetError> { match entry { LogEntry::PrepareBal { slot, ballot } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } // locate instance in memory, filling in null instances if needed while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); @@ -1646,6 +1649,9 @@ impl RSPaxosReplica { ballot, reqs_cw, } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } // locate instance in memory, filling in null instances if needed while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); @@ -1672,6 
+1678,9 @@ impl RSPaxosReplica { } LogEntry::CommitSlot { slot } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } assert!(slot < self.start_slot + self.insts.len()); // update instance status self.insts[slot - self.start_slot].status = Status::Committed; From 1e8a8a08615ad5e6423ac4426a43ac0896ec93e2 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Fri, 6 Oct 2023 12:59:08 -0500 Subject: [PATCH 81/89] add fallback mechanism to Crossword --- src/protocols/crossword.rs | 225 ++++++++++++++++++++++++++++++------- src/protocols/rs_paxos.rs | 52 ++++----- 2 files changed, 208 insertions(+), 69 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 554965cd..7ac067aa 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -73,7 +73,7 @@ pub struct ReplicaConfigCrossword { pub recon_chunk_size: usize, /// Number of shards to assign to each replica. - // TODO: proper config options. + // TODO: think about how to allow unbalanced assignments. pub shards_per_replica: u8, // Performance simulation params (all zeros means no perf simulation): @@ -265,7 +265,11 @@ pub struct CrosswordReplica { population: u8, /// Majority quorum size. - quorum_cnt: u8, + majority: u8, + + /// Current #shards per replica configuration. + // TODO: probably needs something better for unbalanced assignments. + shards_per_replica: u8, /// Configuration parameters struct. config: ReplicaConfigCrossword, @@ -363,14 +367,14 @@ impl CrosswordReplica { bal: 0, status: Status::Null, reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, + self.majority, + self.population - self.majority, )?, voted: ( 0, RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, + self.majority, + self.population - self.majority, )?, ), leader_bk: None, @@ -446,7 +450,7 @@ impl CrosswordReplica { (slot, cmd_idx) } - /// TODO: maybe remove this. + // TODO: think about how to allow unbalanced assignments. #[inline] fn shards_for_replica( slot: usize, @@ -460,14 +464,14 @@ impl CrosswordReplica { .collect() } - /// TODO: should let leader incorporate assignment metadata in Accept - /// messages. With more complex assignment policies, a follower probably - /// does not know the assignment. + // TODO: should let leader incorporate assignment metadata in Accept + // messages. With more complex assignment policies, a follower probably + // does not know the assignment. fn gossip_targets_excl( slot: usize, me: ReplicaId, population: u8, - quorum_cnt: u8, + majority: u8, shards_per_replica: u8, mut avail_shards_map: Bitmap, replica_bk: &mut Option, @@ -483,7 +487,7 @@ impl CrosswordReplica { src_peer = *source; first_try = true; // first try: exclude all parity shards - for idx in quorum_cnt..population { + for idx in majority..population { avail_shards_map.set(idx, true).unwrap(); } *gossip_tried = true; @@ -529,16 +533,24 @@ impl CrosswordReplica { targets_excl } - /// TODO: make better impl of this. + // TODO: think about how to allow unbalanced assignments. 
fn coverage_under_faults( population: u8, acks: &HashMap, fault_tolerance: u8, + // if given, assume balanced assignment + shards_per_replica: Option, ) -> u8 { if acks.len() <= fault_tolerance as usize { return 0; } + // if assuming balanced assignment + if let Some(shards_per_replica) = shards_per_replica { + assert!(shards_per_replica > 0); + return acks.len() as u8 - fault_tolerance + shards_per_replica - 1; + } + // enumerate all subsets of acks excluding fault number of replicas let cnt = (acks.len() - fault_tolerance as usize) as u32; let servers: Vec = acks.keys().cloned().collect(); @@ -571,6 +583,107 @@ impl CrosswordReplica { min_coverage } + + /// Change to a new #shards_per_replica vs. quorum_size configuration. If + /// `redo_accepts` is true, redo all the instances that are currently in + /// the Accepting phase. This typically should happen when we are falling + /// back to a smaller quorum_size because of detected follower failures; + /// for performance-oriented config changes, this is not necessary. + // TODO: think about how to allow unbalanced assignments. + fn change_assignment_config( + &mut self, + shards_per_replica: u8, + redo_accepts: bool, + ) -> Result<(), SummersetError> { + assert!(shards_per_replica > 0); + if shards_per_replica > self.majority { + return Ok(()); // invalid, ignore + } + + let quorum_size = self.majority + self.config.fault_tolerance + 1 + - shards_per_replica; + self.shards_per_replica = shards_per_replica; + pf_info!(self.id; "switching assignment config: ({} - {}) {}", + self.shards_per_replica, quorum_size, + if redo_accepts { "redo" } else { "" }); + + if redo_accepts { + for (slot, inst) in self + .insts + .iter_mut() + .enumerate() + .map(|(s, i)| (self.start_slot + s, i)) + { + if inst.status == Status::Accepting { + assert!(inst.leader_bk.is_some()); + inst.bal = self.bal_prepared; + inst.leader_bk.as_mut().unwrap().accept_acks.clear(); + pf_debug!(self.id; "enter Accept phase for slot {} bal {}", + slot, inst.bal); + + // record update to largest accepted ballot and corresponding data + let subset_copy = inst.reqs_cw.subset_copy( + Bitmap::from( + self.population, + Self::shards_for_replica( + slot, + self.id, + self.population, + self.shards_per_replica, + ), + ), + false, + )?; + inst.voted = (inst.bal, subset_copy.clone()); + self.storage_hub.submit_action( + Self::make_log_action_id(slot, Status::Accepting), + LogAction::Append { + entry: LogEntry::AcceptData { + slot, + ballot: inst.bal, + // persist only some shards on myself + reqs_cw: subset_copy, + }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted AcceptData log action for slot {} bal {}", + slot, inst.bal); + + // send Accept messages to all peers, each getting its subset of + // shards of data + for peer in 0..self.population { + if peer == self.id { + continue; + } + self.transport_hub.send_msg( + PeerMsg::Accept { + slot, + ballot: inst.bal, + reqs_cw: inst.reqs_cw.subset_copy( + Bitmap::from( + self.population, + Self::shards_for_replica( + slot, + peer, + self.population, + self.shards_per_replica, + ), + ), + false, + )?, + }, + peer, + )?; + } + pf_trace!(self.id; "broadcast Accept messages for slot {} bal {}", + slot, inst.bal); + } + } + } + + Ok(()) + } } // CrosswordReplica client requests entrance @@ -613,8 +726,8 @@ impl CrosswordReplica { // compute the complete Reed-Solomon codeword for the batch data let mut reqs_cw = RSCodeword::from_data( req_batch, - self.quorum_cnt, - self.population - self.quorum_cnt, + self.majority, + 
self.population - self.majority, )?; reqs_cw.compute_parity(Some(&self.rs_coder))?; @@ -690,7 +803,7 @@ impl CrosswordReplica { slot, self.id, self.population, - self.config.shards_per_replica, + self.shards_per_replica, ), ), false, @@ -728,7 +841,7 @@ impl CrosswordReplica { slot, peer, self.population, - self.config.shards_per_replica, + self.shards_per_replica, ), ), false, @@ -846,12 +959,12 @@ impl CrosswordReplica { let now_slot = self.commit_bar; self.commit_bar += 1; - if inst.reqs_cw.avail_shards() < self.quorum_cnt { + if inst.reqs_cw.avail_shards() < self.majority { // can't execute if I don't have the complete request batch pf_debug!(self.id; "postponing execution for slot {} (shards {}/{})", - slot, inst.reqs_cw.avail_shards(), self.quorum_cnt); + slot, inst.reqs_cw.avail_shards(), self.majority); break; - } else if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + } else if inst.reqs_cw.avail_data_shards() < self.majority { // have enough shards but need reconstruction inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; } @@ -1023,10 +1136,10 @@ impl CrosswordReplica { // reconstruct the original data, enter Accept phase for this // instance using the request batch value constructed using shards // with the highest ballot number in quorum - if leader_bk.prepare_acks.count() >= self.quorum_cnt - && inst.reqs_cw.avail_shards() >= self.quorum_cnt + if leader_bk.prepare_acks.count() >= self.majority + && inst.reqs_cw.avail_shards() >= self.majority { - if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + if inst.reqs_cw.avail_data_shards() < self.majority { // have enough shards but need reconstruction inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; } @@ -1052,7 +1165,7 @@ impl CrosswordReplica { slot, self.id, self.population, - self.config.shards_per_replica, + self.shards_per_replica, ), ), false, @@ -1088,7 +1201,7 @@ impl CrosswordReplica { slot, peer, self.population, - self.config.shards_per_replica, + self.shards_per_replica, ), ), false, @@ -1202,19 +1315,20 @@ impl CrosswordReplica { slot, peer, self.population, - self.config.shards_per_replica, + self.shards_per_replica, ), ), ); // if quorum size reached AND enough number of shards are // remembered, mark this instance as committed - if leader_bk.accept_acks.len() as u8 >= self.quorum_cnt + if leader_bk.accept_acks.len() as u8 >= self.majority && Self::coverage_under_faults( self.population, &leader_bk.accept_acks, self.config.fault_tolerance, - ) >= self.quorum_cnt + Some(self.shards_per_replica), + ) >= self.majority { inst.status = Status::Committed; pf_debug!(self.id; "committed instance at slot {} bal {}", @@ -1357,12 +1471,12 @@ impl CrosswordReplica { while now_slot < self.start_slot + self.insts.len() { let inst = &mut self.insts[now_slot - self.start_slot]; if inst.status < Status::Committed - || inst.reqs_cw.avail_shards() < self.quorum_cnt + || inst.reqs_cw.avail_shards() < self.majority { break; } - if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + if inst.reqs_cw.avail_data_shards() < self.majority { // have enough shards but need reconstruction inst.reqs_cw .reconstruct_data(Some(&self.rs_coder))?; @@ -1510,6 +1624,18 @@ impl CrosswordReplica { if self.peer_alive.get(peer)? 
{ self.peer_alive.set(peer, false)?; pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + // check if we need to fall back to a config with smaller + // fast-path quorum size + let curr_quorum_size = + self.majority + self.config.fault_tolerance + 1 + - self.shards_per_replica; + if self.peer_alive.count() < curr_quorum_size { + self.change_assignment_config( + self.shards_per_replica + curr_quorum_size + - self.peer_alive.count(), + true, + )?; + } } } @@ -1579,7 +1705,7 @@ impl CrosswordReplica { // complicated and slow to do the "data shards only" optimization // during fail-over, so just do this conservatively here if inst.status == Status::Committed - && inst.reqs_cw.avail_shards() < self.quorum_cnt + && inst.reqs_cw.avail_shards() < self.majority { recon_slots.insert(slot, inst.reqs_cw.avail_shards_vec()); @@ -1649,6 +1775,18 @@ impl CrosswordReplica { // I also heard this heartbeat from myself self.heard_heartbeat(self.id, self.bal_prep_sent, self.exec_bar)?; + // check if we need to fall back to a config with smaller fast-path + // quorum size + let curr_quorum_size = self.majority + self.config.fault_tolerance + 1 + - self.shards_per_replica; + if self.peer_alive.count() < curr_quorum_size { + self.change_assignment_config( + self.shards_per_replica + curr_quorum_size + - self.peer_alive.count(), + true, + )?; + } + // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent); Ok(()) } @@ -1765,14 +1903,14 @@ impl CrosswordReplica { let avail_shards_map = self.insts[slot - self.start_slot] .reqs_cw .avail_shards_map(); - if avail_shards_map.count() < self.quorum_cnt { + if avail_shards_map.count() < self.majority { // decide which peers to ask for which shards from let targets_excl = Self::gossip_targets_excl( slot, self.id, self.population, - self.quorum_cnt, - self.config.shards_per_replica, + self.majority, + self.shards_per_replica, avail_shards_map, &mut self.insts[slot - self.start_slot].replica_bk, ); @@ -2017,11 +2155,11 @@ impl CrosswordReplica { // update commit_bar self.commit_bar += 1; // check number of available shards - if inst.reqs_cw.avail_shards() < self.quorum_cnt { + if inst.reqs_cw.avail_shards() < self.majority { // can't execute if I don't have the complete request batch break; } else if inst.reqs_cw.avail_data_shards() - < self.quorum_cnt + < self.majority { // have enough shards but need reconstruction inst.reqs_cw @@ -2110,7 +2248,7 @@ impl CrosswordReplica { let mut pairs = HashMap::new(); for slot in self.start_slot..self.exec_bar { let inst = &mut self.insts[slot - self.start_slot]; - assert!(inst.reqs_cw.avail_data_shards() >= self.quorum_cnt); + assert!(inst.reqs_cw.avail_data_shards() >= self.majority); for (_, req) in inst.reqs_cw.get_data()?.clone() { if let ApiRequest::Req { cmd: Command::Put { key, value }, @@ -2467,20 +2605,20 @@ impl GenericReplica for CrosswordReplica { // create a Reed-Solomon coder with num_data_shards == quorum size and // num_parity shards == population - quorum - let quorum_cnt = (population / 2) + 1; - if config.fault_tolerance > (population - quorum_cnt) { + let majority = (population / 2) + 1; + if config.fault_tolerance > (population - majority) { return logged_err!(id; "invalid config.fault_tolerance '{}'", config.fault_tolerance); } if config.shards_per_replica == 0 - || config.shards_per_replica > quorum_cnt + || config.shards_per_replica > majority { return logged_err!(id; "invalid config.shards_per_replica '{}'", config.shards_per_replica); } let rs_coder = ReedSolomon::new( - 
quorum_cnt as usize, - (population - quorum_cnt) as usize, + majority as usize, + (population - majority) as usize, )?; // proactively connect to some peers, then wait for all population @@ -2527,7 +2665,8 @@ impl GenericReplica for CrosswordReplica { Ok(CrosswordReplica { id, population, - quorum_cnt, + majority, + shards_per_replica: config.shards_per_replica, config, _api_addr: api_addr, _p2p_addr: p2p_addr, diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 79c7a09a..750ec48f 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -245,7 +245,7 @@ pub struct RSPaxosReplica { population: u8, /// Majority quorum size. - quorum_cnt: u8, + majority: u8, /// Configuration parameters struct. config: ReplicaConfigRSPaxos, @@ -340,14 +340,14 @@ impl RSPaxosReplica { bal: 0, status: Status::Null, reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, + self.majority, + self.population - self.majority, )?, voted: ( 0, RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, + self.majority, + self.population - self.majority, )?, ), leader_bk: None, @@ -464,8 +464,8 @@ impl RSPaxosReplica { // compute the complete Reed-Solomon codeword for the batch data let mut reqs_cw = RSCodeword::from_data( req_batch, - self.quorum_cnt, - self.population - self.quorum_cnt, + self.majority, + self.population - self.majority, )?; reqs_cw.compute_parity(Some(&self.rs_coder))?; @@ -680,12 +680,12 @@ impl RSPaxosReplica { let now_slot = self.commit_bar; self.commit_bar += 1; - if inst.reqs_cw.avail_shards() < self.quorum_cnt { + if inst.reqs_cw.avail_shards() < self.majority { // can't execute if I don't have the complete request batch pf_debug!(self.id; "postponing execution for slot {} (shards {}/{})", - slot, inst.reqs_cw.avail_shards(), self.quorum_cnt); + slot, inst.reqs_cw.avail_shards(), self.majority); break; - } else if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + } else if inst.reqs_cw.avail_data_shards() < self.majority { // have enough shards but need reconstruction inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; } @@ -854,10 +854,10 @@ impl RSPaxosReplica { // reconstruct the original data, enter Accept phase for this // instance using the request batch value constructed using shards // with the highest ballot number in quorum - if leader_bk.prepare_acks.count() >= self.quorum_cnt - && inst.reqs_cw.avail_shards() >= self.quorum_cnt + if leader_bk.prepare_acks.count() >= self.majority + && inst.reqs_cw.avail_shards() >= self.majority { - if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + if inst.reqs_cw.avail_data_shards() < self.majority { // have enough shards but need reconstruction inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; } @@ -1010,9 +1010,9 @@ impl RSPaxosReplica { // if quorum size reached AND enough number of shards are // remembered, mark this instance as committed; in RS-Paxos, this - // means accept_acks.count() >= self.quorum_cnt + fault_tolerance + // means accept_acks.count() >= self.majority + fault_tolerance if leader_bk.accept_acks.count() - >= self.quorum_cnt + self.config.fault_tolerance + >= self.majority + self.config.fault_tolerance { inst.status = Status::Committed; pf_debug!(self.id; "committed instance at slot {} bal {}", @@ -1150,12 +1150,12 @@ impl RSPaxosReplica { while now_slot < self.start_slot + self.insts.len() { let inst = &mut self.insts[now_slot - self.start_slot]; if inst.status < Status::Committed - || inst.reqs_cw.avail_shards() < 
self.quorum_cnt + || inst.reqs_cw.avail_shards() < self.majority { break; } - if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + if inst.reqs_cw.avail_data_shards() < self.majority { // have enough shards but need reconstruction inst.reqs_cw .reconstruct_data(Some(&self.rs_coder))?; @@ -1370,7 +1370,7 @@ impl RSPaxosReplica { // do reconstruction reads for all committed instances that do not // hold enough available shards for reconstruction if inst.status == Status::Committed - && inst.reqs_cw.avail_shards() < self.quorum_cnt + && inst.reqs_cw.avail_shards() < self.majority { recon_slots.push(slot); } @@ -1696,11 +1696,11 @@ impl RSPaxosReplica { // update commit_bar self.commit_bar += 1; // check number of available shards - if inst.reqs_cw.avail_shards() < self.quorum_cnt { + if inst.reqs_cw.avail_shards() < self.majority { // can't execute if I don't have the complete request batch break; } else if inst.reqs_cw.avail_data_shards() - < self.quorum_cnt + < self.majority { // have enough shards but need reconstruction inst.reqs_cw @@ -1789,7 +1789,7 @@ impl RSPaxosReplica { let mut pairs = HashMap::new(); for slot in self.start_slot..self.exec_bar { let inst = &mut self.insts[slot - self.start_slot]; - assert!(inst.reqs_cw.avail_data_shards() >= self.quorum_cnt); + assert!(inst.reqs_cw.avail_data_shards() >= self.majority); for (_, req) in inst.reqs_cw.get_data()?.clone() { if let ApiRequest::Req { cmd: Command::Put { key, value }, @@ -2144,14 +2144,14 @@ impl GenericReplica for RSPaxosReplica { // create a Reed-Solomon coder with num_data_shards == quorum size and // num_parity shards == population - quorum - let quorum_cnt = (population / 2) + 1; - if config.fault_tolerance > (population - quorum_cnt) { + let majority = (population / 2) + 1; + if config.fault_tolerance > (population - majority) { return logged_err!(id; "invalid config.fault_tolerance '{}'", config.fault_tolerance); } let rs_coder = ReedSolomon::new( - quorum_cnt as usize, - (population - quorum_cnt) as usize, + majority as usize, + (population - majority) as usize, )?; // proactively connect to some peers, then wait for all population @@ -2198,7 +2198,7 @@ impl GenericReplica for RSPaxosReplica { Ok(RSPaxosReplica { id, population, - quorum_cnt, + majority, config, _api_addr: api_addr, _p2p_addr: p2p_addr, From 1b04307924fc41d79d76241351ac721c91381e97 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Fri, 6 Oct 2023 13:07:13 -0500 Subject: [PATCH 82/89] minor updates to README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fb43e7d9..c5411262 100644 --- a/README.md +++ b/README.md @@ -160,8 +160,8 @@ Complete cluster management and benchmarking scripts are available in another re - [ ] implementation of CRaft - [x] implementation of Crossword prototype - [x] fault recovery reads - - [ ] follower gossiping - - [ ] fall-back mechanism + - [x] follower gossiping + - [x] fall-back mechanism - [ ] workload adaptiveness - [ ] unbalanced assignment - [x] client-side utilities From 68066d9fd008aae7ec5c5136142f9801b8511632 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Fri, 6 Oct 2023 14:32:35 -0500 Subject: [PATCH 83/89] start working on Raft impl --- src/lib.rs | 1 + src/protocols/mod.rs | 18 +++ src/protocols/raft.rs | 279 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 298 insertions(+) create mode 100644 src/protocols/raft.rs diff --git a/src/lib.rs b/src/lib.rs index 2de53e51..7d7a4f2a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -35,5 +35,6 
@@ pub use crate::protocols::SmrProtocol; pub use crate::protocols::{ReplicaConfigRepNothing, ClientConfigRepNothing}; pub use crate::protocols::{ReplicaConfigSimplePush, ClientConfigSimplePush}; pub use crate::protocols::{ReplicaConfigMultiPaxos, ClientConfigMultiPaxos}; +pub use crate::protocols::{ReplicaConfigRaft, ClientConfigRaft}; pub use crate::protocols::{ReplicaConfigRSPaxos, ClientConfigRSPaxos}; pub use crate::protocols::{ReplicaConfigCrossword, ClientConfigCrossword}; diff --git a/src/protocols/mod.rs b/src/protocols/mod.rs index 3aae79bf..25cb4ede 100644 --- a/src/protocols/mod.rs +++ b/src/protocols/mod.rs @@ -22,6 +22,10 @@ mod multipaxos; use multipaxos::{MultiPaxosReplica, MultiPaxosClient}; pub use multipaxos::{ReplicaConfigMultiPaxos, ClientConfigMultiPaxos}; +mod raft; +use raft::{RaftReplica, RaftClient}; +pub use raft::{ReplicaConfigRaft, ClientConfigRaft}; + mod rs_paxos; use rs_paxos::{RSPaxosReplica, RSPaxosClient}; pub use rs_paxos::{ReplicaConfigRSPaxos, ClientConfigRSPaxos}; @@ -36,6 +40,7 @@ pub enum SmrProtocol { RepNothing, SimplePush, MultiPaxos, + Raft, RSPaxos, Crossword, } @@ -56,6 +61,7 @@ impl SmrProtocol { "RepNothing" => Some(Self::RepNothing), "SimplePush" => Some(Self::SimplePush), "MultiPaxos" => Some(Self::MultiPaxos), + "Raft" => Some(Self::Raft), "RSPaxos" => Some(Self::RSPaxos), "Crossword" => Some(Self::Crossword), _ => None, @@ -106,6 +112,14 @@ impl SmrProtocol { .await ) } + Self::Raft => { + box_if_ok!( + RaftReplica::new_and_setup( + api_addr, p2p_addr, manager, config_str + ) + .await + ) + } Self::RSPaxos => { box_if_ok!( RSPaxosReplica::new_and_setup( @@ -147,6 +161,9 @@ impl SmrProtocol { MultiPaxosClient::new_and_setup(manager, config_str).await ) } + Self::Raft => { + box_if_ok!(RaftClient::new_and_setup(manager, config_str).await) + } Self::RSPaxos => { box_if_ok!( RSPaxosClient::new_and_setup(manager, config_str).await @@ -185,6 +202,7 @@ mod protocols_name_tests { valid_name_test!(RepNothing); valid_name_test!(SimplePush); valid_name_test!(MultiPaxos); + valid_name_test!(Raft); valid_name_test!(RSPaxos); valid_name_test!(Crossword); } diff --git a/src/protocols/raft.rs b/src/protocols/raft.rs new file mode 100644 index 00000000..f4a60f47 --- /dev/null +++ b/src/protocols/raft.rs @@ -0,0 +1,279 @@ +//! Replication protocol: Raft. +//! +//! References: +//! - +//! - + +use crate::utils::{SummersetError, Bitmap, Timer}; + +/// Configuration parameters struct. +#[derive(Debug, Deserialize)] +pub struct ReplicaConfigRaft { + /// Client request batching interval in microsecs. + pub batch_interval_us: u64, + + /// Client request batching maximum batch size. + pub max_batch_size: usize, + + /// Path to backing log file. + pub backer_path: String, + + /// Whether to call `fsync()`/`fdatasync()` on logger. + pub logger_sync: bool, + + // Performance simulation params (all zeros means no perf simulation): + pub perf_storage_a: u64, + pub perf_storage_b: u64, + pub perf_network_a: u64, + pub perf_network_b: u64, +} + +#[allow(clippy::derivable_impls)] +impl Default for ReplicaConfigRaft { + fn default() -> Self { + ReplicaConfigRaft { + batch_interval_us: 1000, + max_batch_size: 5000, + backer_path: "/tmp/summerset.raft.wal".into(), + logger_sync: false, + perf_storage_a: 0, + perf_storage_b: 0, + perf_network_a: 0, + perf_network_b: 0, + } + } +} + +/// Raft server replica module. +pub struct RaftReplica { + /// Replica ID in cluster. + id: ReplicaId, + + /// Total number of replicas in cluster. 
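Both the erasure-coded setup above and the Raft skeleton here size quorums the same way: a simple majority of the cluster, which also fixes the Reed-Solomon geometry (data shards = majority, parity shards = the rest) and caps `fault_tolerance` at the parity count. A minimal standalone sketch of that arithmetic, separate from the crate itself:

```rust
// Standalone sketch (not the crate's code) relating cluster size to the
// majority quorum and the Reed-Solomon dimensions used above.
fn main() {
    for population in [3u8, 5, 7] {
        // simple majority: Raft's `quorum_cnt`, and the number of RS data
        // shards in RS-Paxos / Crossword
        let majority = (population / 2) + 1;
        // parity shards occupy the remaining replicas
        let parity = population - majority;
        // `config.fault_tolerance` is rejected if it exceeds this bound
        let max_fault_tolerance = population - majority;
        println!(
            "n={population}: majority={majority}, \
             RS({majority} data + {parity} parity), \
             fault_tolerance <= {max_fault_tolerance}"
        );
    }
}
```

For a 5-replica cluster this gives a 3-of-5 majority, an RS(3, 2) codeword, and at most 2 tolerable replica failures.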
+ population: u8, + + /// Majority quorum size. + quorum_cnt: u8, + + /// Configuration parameters struct. + config: ReplicaConfigRaft, +} + +#[async_trait] +impl GenericReplica for RaftReplica { + async fn new_and_setup( + api_addr: SocketAddr, + p2p_addr: SocketAddr, + manager: SocketAddr, + config_str: Option<&str>, + ) -> Result { + Ok(RaftReplica { + id, + population, + quorum_cnt: (population / 2) + 1, + config, + _api_addr: api_addr, + _p2p_addr: p2p_addr, + }) + } + + async fn run( + &mut self, + mut rx_term: watch::Receiver, + ) -> Result { + } + + fn id(&self) -> ReplicaId { + self.id + } +} + +/// Configuration parameters struct. +#[derive(Debug, Deserialize)] +pub struct ClientConfigRaft { + /// Which server to pick initially. + pub init_server_id: ReplicaId, +} + +#[allow(clippy::derivable_impls)] +impl Default for ClientConfigRaft { + fn default() -> Self { + ClientConfigRaft { init_server_id: 0 } + } +} + +/// Raft client-side module. +pub struct RaftClient { + /// Client ID. + id: ClientId, + + /// Configuration parameters struct. + _config: ClientConfigRaft, + + /// List of active servers information. + servers: HashMap, + + /// Current server ID to talk to. + server_id: ReplicaId, + + /// Control API stub to the cluster manager. + ctrl_stub: ClientCtrlStub, + + /// API stubs for communicating with servers. + api_stubs: HashMap, +} + +#[async_trait] +impl GenericEndpoint for RaftClient { + async fn new_and_setup( + manager: SocketAddr, + config_str: Option<&str>, + ) -> Result { + // connect to the cluster manager and get assigned a client ID + pf_info!("c"; "connecting to manager '{}'...", manager); + let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; + let id = ctrl_stub.id; + + // parse protocol-specific configs + let config = parsed_config!(config_str => ClientConfigRaft; + init_server_id)?; + let init_server_id = config.init_server_id; + + Ok(RaftClient { + id, + _config: config, + servers: HashMap::new(), + server_id: init_server_id, + ctrl_stub, + api_stubs: HashMap::new(), + }) + } + + async fn connect(&mut self) -> Result<(), SummersetError> { + // disallow reconnection without leaving + if !self.api_stubs.is_empty() { + return logged_err!(self.id; "reconnecting without leaving"); + } + + // ask the manager about the list of active servers + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + while !sent { + sent = self.ctrl_stub.send_req(None)?; + } + + let reply = self.ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::QueryInfo { + population, + servers, + } => { + // shift to a new server_id if current one not active + assert!(!servers.is_empty()); + while !servers.contains_key(&self.server_id) { + self.server_id = (self.server_id + 1) % population; + } + // establish connection to all servers + self.servers = servers + .into_iter() + .map(|(id, info)| (id, info.0)) + .collect(); + for (&id, &server) in &self.servers { + pf_info!(self.id; "connecting to server {} '{}'...", id, server); + let api_stub = + ClientApiStub::new_by_connect(self.id, server).await?; + self.api_stubs.insert(id, api_stub); + } + Ok(()) + } + _ => logged_err!(self.id; "unexpected reply type received"), + } + } + + async fn leave(&mut self, permanent: bool) -> Result<(), SummersetError> { + // send leave notification to all servers + for (id, mut api_stub) in self.api_stubs.drain() { + let mut sent = api_stub.send_req(Some(&ApiRequest::Leave))?; + while !sent { + sent = api_stub.send_req(None)?; + } + + while api_stub.recv_reply().await? 
!= ApiReply::Leave {} + pf_info!(self.id; "left server connection {}", id); + api_stub.forget(); + } + + // if permanently leaving, send leave notification to the manager + if permanent { + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; + while !sent { + sent = self.ctrl_stub.send_req(None)?; + } + + while self.ctrl_stub.recv_reply().await? != CtrlReply::Leave {} + pf_info!(self.id; "left manager connection"); + } + + Ok(()) + } + + fn send_req( + &mut self, + req: Option<&ApiRequest>, + ) -> Result { + if self.api_stubs.contains_key(&self.server_id) { + self.api_stubs + .get_mut(&self.server_id) + .unwrap() + .send_req(req) + } else { + Err(SummersetError(format!( + "server_id {} not in api_stubs", + self.server_id + ))) + } + } + + async fn recv_reply(&mut self) -> Result { + if self.api_stubs.contains_key(&self.server_id) { + let reply = self + .api_stubs + .get_mut(&self.server_id) + .unwrap() + .recv_reply() + .await?; + + if let ApiReply::Reply { + ref result, + ref redirect, + .. + } = reply + { + // if the current server redirects me to a different server + if result.is_none() && redirect.is_some() { + let redirect_id = redirect.unwrap(); + assert!(self.servers.contains_key(&redirect_id)); + self.server_id = redirect_id; + pf_debug!(self.id; "redirected to replica {} '{}'", + redirect_id, self.servers[&redirect_id]); + } + } + + Ok(reply) + } else { + Err(SummersetError(format!( + "server_id {} not in api_stubs", + self.server_id + ))) + } + } + + fn id(&self) -> ClientId { + self.id + } + + fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + &mut self.ctrl_stub + } +} From 90b13fd74e67f538eaca873af9473ccab4047ee5 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sat, 7 Oct 2023 17:16:29 -0500 Subject: [PATCH 84/89] rename LogEntry to WalEntry for Paxos-style impls --- README.md | 10 +- scripts/local_cluster.py | 2 + src/protocols/crossword.rs | 272 ++++++++++++++++++----------------- src/protocols/multipaxos.rs | 126 +++++++++------- src/protocols/raft.rs | 198 ++++++++++++++++++++++++- src/protocols/rep_nothing.rs | 54 +++---- src/protocols/rs_paxos.rs | 126 +++++++++------- src/protocols/simple_push.rs | 62 ++++---- src/server/transport.rs | 9 +- 9 files changed, 554 insertions(+), 305 deletions(-) diff --git a/README.md b/README.md index c5411262..d968b5d5 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ git push origin [![Proc tests status](https://github.com/josehu07/summerset/actions/workflows/tests_proc.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Atests_proc) [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) -Summerset is a distributed key-value store supporting a wide range of state machine replication (SMR) protocols for research purposes. More protocols are actively being added. +Summerset is a distributed, replicated, protocol-generic key-value store supporting a wide range of state machine replication (SMR) protocols for research purposes. More protocols are actively being added.

@@ -69,6 +69,7 @@ Formal TLA+ specification of some protocols are provided in `tla+/`. - **Async Rust**: Summerset is written in Rust and demonstrates canonical usage of async programming structures backed by the [`tokio`](https://tokio.rs/) framework; - **Event-based**: Summerset adopts a channel-oriented, event-based system architecture; each replication protocol is basically just a set of event handlers plus a `tokio::select!` loop; - **Modularized**: Common components of a distributed KV store, e.g. network transport and durable logger, are cleanly separated from each other and connected through channels. +- **Protocol-generic**: With the above two points combined, Summerset is able to support a set of different replication protocols in one codebase, each being just a single file, with common functionalities abstracted out. These design choices make protocol implementation in Summerset surprisingly straight-forward and **understandable**, without any sacrifice on performance. Comments / issues / PRs are always welcome! @@ -155,15 +156,22 @@ Complete cluster management and benchmarking scripts are available in another re - [ ] specialize read-only commands? - [ ] separate commit vs. exec responses? - [ ] membership discovery & view changes + - [x] TLA+ spec - [x] implementation of RS-Paxos + - [ ] TLA+ spec - [ ] implementation of Raft + - [ ] snapshotting & garbage collection + - [ ] membership discovery & view changes + - [ ] TLA+ spec - [ ] implementation of CRaft + - [ ] TLA+ spec - [x] implementation of Crossword prototype - [x] fault recovery reads - [x] follower gossiping - [x] fall-back mechanism - [ ] workload adaptiveness - [ ] unbalanced assignment + - [ ] TLA+ spec - [x] client-side utilities - [x] REPL-style client - [x] random benchmarking client diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index a088ed81..9c47d048 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -44,12 +44,14 @@ def kill_all_matching(name, force=False): "RepNothing": lambda r: f"backer_path='/tmp/summerset.rep_nothing.{r}.wal'", "SimplePush": lambda r: f"backer_path='/tmp/summerset.simple_push.{r}.wal'", "MultiPaxos": lambda r: f"backer_path='/tmp/summerset.multipaxos.{r}.wal'", + "Raft": lambda r: f"backer_path='/tmp/summerset.raft.{r}.wal'", "RSPaxos": lambda r: f"backer_path='/tmp/summerset.rs_paxos.{r}.wal'", "Crossword": lambda r: f"backer_path='/tmp/summerset.crossword.{r}.wal'", } PROTOCOL_SNAPSHOT_PATH = { "MultiPaxos": lambda r: f"snapshot_path='/tmp/summerset.multipaxos.{r}.snap'", + "Raft": lambda r: f"snapshot_path='/tmp/summerset.raft.{r}.snap'", "RSPaxos": lambda r: f"snapshot_path='/tmp/summerset.rs_paxos.{r}.snap'", "Crossword": lambda r: f"snapshot_path='/tmp/summerset.crossword.{r}.snap'", } diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 7ac067aa..69028ce8 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -1,8 +1,8 @@ //! Replication protocol: Crossword. //! -//! MultiPaxos with flexible Reed-Solomon erasure coding that supports tunable -//! shard groups, asymmetric shard assignment, and follower gossiping for actual -//! usability. +//! MultiPaxos with flexible Reed-Solomon erasure code sharding that supports +//! dynamically tunable shard assignment with the correct liveness constraints, +//! plus follower gossiping for actual usability. use std::collections::HashMap; use std::path::Path; @@ -34,8 +34,8 @@ use reed_solomon_erasure::galois_8::ReedSolomon; /// Configuration parameters struct. 
#[derive(Debug, Deserialize)] pub struct ReplicaConfigCrossword { - /// Client request batching interval in microsecs. - pub batch_interval_us: u64, + /// Client request batching interval in millisecs. + pub batch_interval_ms: u64, /// Client request batching maximum batch size. pub max_batch_size: usize, @@ -87,7 +87,7 @@ pub struct ReplicaConfigCrossword { impl Default for ReplicaConfigCrossword { fn default() -> Self { ReplicaConfigCrossword { - batch_interval_us: 1000, + batch_interval_ms: 10, max_batch_size: 5000, backer_path: "/tmp/summerset.rs_paxos.wal".into(), logger_sync: false, @@ -176,14 +176,20 @@ struct Instance { external: bool, /// Offset of first durable WAL log entry related to this instance. - log_offset: usize, + wal_offset: usize, } -/// Stable storage log entry type. +/// Stable storage WAL log entry type. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] -enum LogEntry { +enum WalEntry { /// Records an update to the largest prepare ballot seen. - PrepareBal { slot: usize, ballot: Ballot }, + PrepareBal { + /// Slot index in Prepare message is the triggering slot of this + /// Prepare. Once prepared, it means that all slots in the range + /// [slot, +infinity) are prepared under this ballot number. + slot: usize, + ballot: Ballot, + }, /// Records a newly accepted request batch data shards at slot index. AcceptData { @@ -197,6 +203,10 @@ enum LogEntry { } /// Snapshot file entry type. +/// +/// NOTE: the current implementation simply appends a squashed log at the +/// end of the snapshot file for simplicity. In production, the snapshot +/// file should be a bounded-sized backend, e.g., an LSM-tree. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum SnapEntry { /// Necessary slot indices to remember. @@ -290,7 +300,7 @@ pub struct CrosswordReplica { state_machine: StateMachine, /// StorageHub module. - storage_hub: StorageHub, + storage_hub: StorageHub, /// StorageHub module for the snapshot file. snapshot_hub: StorageHub, @@ -342,8 +352,8 @@ pub struct CrosswordReplica { /// It is always true that exec_bar <= commit_bar <= start_slot + insts.len() exec_bar: usize, - /// Current durable log file offset. - log_offset: usize, + /// Current durable WAL log file offset. + wal_offset: usize, /// Current durable snapshot file offset. snap_offset: usize, @@ -380,7 +390,7 @@ impl CrosswordReplica { leader_bk: None, replica_bk: None, external: false, - log_offset: 0, + wal_offset: 0, }) } @@ -464,75 +474,6 @@ impl CrosswordReplica { .collect() } - // TODO: should let leader incorporate assignment metadata in Accept - // messages. With more complex assignment policies, a follower probably - // does not know the assignment. 
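The fall-back path added earlier in this patch maintains the liveness constraint `quorum_size + shards_per_replica >= majority + fault_tolerance + 1`: when fewer peers look alive than the quorum required by the current assignment, `shards_per_replica` is bumped by exactly the deficit. A standalone sketch of that arithmetic (the helper name here is hypothetical, not the crate's API):

```rust
// Standalone sketch (not the crate's code) of the fall-back arithmetic:
// an Accept quorum of size q with s shards per replica is only usable if
//     q + s >= majority + fault_tolerance + 1.
fn fallback_shards_per_replica(
    majority: u8,
    fault_tolerance: u8,
    shards_per_replica: u8,
    alive_cnt: u8,
) -> u8 {
    // quorum size that the current assignment config requires
    let curr_quorum_size = majority + fault_tolerance + 1 - shards_per_replica;
    if alive_cnt < curr_quorum_size {
        // fall back: give every replica enough extra shards so that the
        // currently-alive set becomes a valid quorum again
        shards_per_replica + curr_quorum_size - alive_cnt
    } else {
        shards_per_replica // current config is still satisfiable
    }
}

fn main() {
    // population 5 => majority 3; fault_tolerance 1; 1 shard per replica
    assert_eq!(fallback_shards_per_replica(3, 1, 1, 5), 1); // all alive
    assert_eq!(fallback_shards_per_replica(3, 1, 1, 4), 1); // 4 >= 4, still ok
    assert_eq!(fallback_shards_per_replica(3, 1, 1, 3), 2); // bump to 2 shards
    println!("fall-back arithmetic checks passed");
}
```

Bumping `shards_per_replica` gives up some of the bandwidth savings of sharding in exchange for being able to commit with the smaller set of reachable peers.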
- fn gossip_targets_excl( - slot: usize, - me: ReplicaId, - population: u8, - majority: u8, - shards_per_replica: u8, - mut avail_shards_map: Bitmap, - replica_bk: &mut Option, - ) -> HashMap> { - let mut src_peer = me; - let mut first_try = false; - if let Some(ReplicaBookkeeping { - source, - gossip_tried, - }) = replica_bk - { - if !*gossip_tried { - src_peer = *source; - first_try = true; - // first try: exclude all parity shards - for idx in majority..population { - avail_shards_map.set(idx, true).unwrap(); - } - *gossip_tried = true; - } - } - - // greedily considers my peers, starting from the one with my ID + 1, - // until all data shards covered - let mut targets_excl = HashMap::new(); - for p in (me + 1)..(population + me) { - let peer = p % population; - if !first_try { - // first try probably did not succeed, so do it conservatively - targets_excl.insert(peer, avail_shards_map.to_vec()); - } else { - // skip leader who initially replicated this instance to me - if peer == src_peer { - continue; - } - // first try: only ask for a minimum number of data shards - let mut useful_shards = Vec::new(); - for idx in Self::shards_for_replica( - slot, - peer, - population, - shards_per_replica, - ) { - if !avail_shards_map.get(idx).unwrap() { - useful_shards.push(idx); - } - } - // if this peer has data shards which I don't have right now - // and I have not asked others for in this round - if !useful_shards.is_empty() { - targets_excl.insert(peer, avail_shards_map.to_vec()); - for idx in useful_shards { - avail_shards_map.set(idx, true).unwrap(); - } - } - } - } - - targets_excl - } - // TODO: think about how to allow unbalanced assignments. fn coverage_under_faults( population: u8, @@ -638,7 +579,7 @@ impl CrosswordReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { + entry: WalEntry::AcceptData { slot, ballot: inst.bal, // persist only some shards on myself @@ -767,7 +708,7 @@ impl CrosswordReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { + entry: WalEntry::PrepareBal { slot, ballot: self.bal_prep_sent, }, @@ -812,7 +753,7 @@ impl CrosswordReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { + entry: WalEntry::AcceptData { slot, ballot: inst.bal, // persist only some shards on myself @@ -998,7 +939,7 @@ impl CrosswordReplica { fn handle_log_result( &mut self, action_id: LogActionId, - log_result: LogResult, + log_result: LogResult, ) -> Result<(), SummersetError> { let (slot, entry_type) = Self::split_log_action_id(action_id); if slot < self.start_slot { @@ -1007,15 +948,15 @@ impl CrosswordReplica { assert!(slot < self.start_slot + self.insts.len()); if let LogResult::Append { now_size } = log_result { - assert!(now_size >= self.log_offset); - // update first log_offset of slot + assert!(now_size >= self.wal_offset); + // update first wal_offset of slot let inst = &mut self.insts[slot - self.start_slot]; - if inst.log_offset == 0 || inst.log_offset > self.log_offset { - inst.log_offset = self.log_offset; + if inst.wal_offset == 0 || inst.wal_offset > self.wal_offset { + inst.wal_offset = self.wal_offset; } - assert!(inst.log_offset <= self.log_offset); - // then update self.log_offset - self.log_offset = now_size; + assert!(inst.wal_offset <= self.wal_offset); + // then update self.wal_offset + 
self.wal_offset = now_size; } else { return logged_err!(self.id; "unexpected log result type: {:?}", log_result); } @@ -1069,7 +1010,7 @@ impl CrosswordReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { slot, ballot }, + entry: WalEntry::PrepareBal { slot, ballot }, sync: self.config.logger_sync, }, )?; @@ -1174,7 +1115,7 @@ impl CrosswordReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { + entry: WalEntry::AcceptData { slot, ballot, reqs_cw: subset_copy, @@ -1257,7 +1198,7 @@ impl CrosswordReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { + entry: WalEntry::AcceptData { slot, ballot, reqs_cw: inst.reqs_cw.clone(), @@ -1338,7 +1279,7 @@ impl CrosswordReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Committed), LogAction::Append { - entry: LogEntry::CommitSlot { slot }, + entry: WalEntry::CommitSlot { slot }, sync: self.config.logger_sync, }, )?; @@ -1387,7 +1328,7 @@ impl CrosswordReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Committed), LogAction::Append { - entry: LogEntry::CommitSlot { slot }, + entry: WalEntry::CommitSlot { slot }, sync: self.config.logger_sync, }, )?; @@ -1678,7 +1619,7 @@ impl CrosswordReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { + entry: WalEntry::PrepareBal { slot, ballot: self.bal_prep_sent, }, @@ -1875,6 +1816,75 @@ impl CrosswordReplica { Ok(()) } + // TODO: should let leader incorporate assignment metadata in Accept + // messages. With more complex assignment policies, a follower probably + // does not know the assignment. 
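The commit condition above requires a majority of Accept acks and, additionally, `coverage_under_faults(...) >= majority`. My reading of that check: after any `fault_tolerance` of the acknowledging replicas fail, the surviving acks must still cover at least `majority` distinct shards, so the codeword stays reconstructable. A brute-force standalone sketch under an assumed contiguous round-robin assignment (the real assignment is also rotated per slot and may be unbalanced):

```rust
// Standalone sketch (not the crate's code) of what a check like
// `coverage_under_faults(...) >= majority` guards: after any
// `fault_tolerance` of the acknowledging replicas are lost, the surviving
// acks must still cover at least `majority` distinct shards. The contiguous
// assignment below is an assumption for illustration only.
use std::collections::HashSet;

/// Shards assumed to be held by replica `r` (contiguous, wrapping around).
fn shards_of(r: u8, population: u8, shards_per_replica: u8) -> Vec<u8> {
    (0..shards_per_replica).map(|j| (r + j) % population).collect()
}

/// Worst-case distinct-shard coverage after losing any `faults` ack'ers.
/// (Brute force over subsets; fine for the tiny populations used here.)
fn coverage_under_faults(
    population: u8,
    acks: &[u8],
    faults: u8,
    shards_per_replica: u8,
) -> u8 {
    if acks.len() <= faults as usize {
        return 0;
    }
    let survivors = acks.len() - faults as usize;
    let mut worst = population;
    for mask in 0u32..(1 << acks.len()) {
        if mask.count_ones() as usize != survivors {
            continue;
        }
        let mut covered = HashSet::new();
        for (i, &r) in acks.iter().enumerate() {
            if mask & (1 << i) != 0 {
                covered.extend(shards_of(r, population, shards_per_replica));
            }
        }
        worst = worst.min(covered.len() as u8);
    }
    worst
}

fn main() {
    // population 5, majority 3: three acks holding 1 shard each cannot
    // survive one fault (coverage drops to 2), but with 2 shards each they can.
    assert_eq!(coverage_under_faults(5, &[0, 1, 2], 1, 1), 2);
    assert!(coverage_under_faults(5, &[0, 1, 2], 1, 2) >= 3);
    println!("coverage-under-faults checks passed");
}
```

The `Some(self.shards_per_replica)` argument in the patched call presumably supplies this balanced-assignment width.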
+ fn gossip_targets_excl( + slot: usize, + me: ReplicaId, + population: u8, + majority: u8, + shards_per_replica: u8, + mut avail_shards_map: Bitmap, + replica_bk: &mut Option, + ) -> HashMap> { + let mut src_peer = me; + let mut first_try = false; + if let Some(ReplicaBookkeeping { + source, + gossip_tried, + }) = replica_bk + { + if !*gossip_tried { + src_peer = *source; + first_try = true; + // first try: exclude all parity shards + for idx in majority..population { + avail_shards_map.set(idx, true).unwrap(); + } + *gossip_tried = true; + } + } + + // greedily considers my peers, starting from the one with my ID + 1, + // until all data shards covered + let mut targets_excl = HashMap::new(); + for p in (me + 1)..(population + me) { + let peer = p % population; + if !first_try { + // first try probably did not succeed, so do it conservatively + targets_excl.insert(peer, avail_shards_map.to_vec()); + } else { + // skip leader who initially replicated this instance to me + if peer == src_peer { + continue; + } + // first try: only ask for a minimum number of data shards + let mut useful_shards = Vec::new(); + for idx in Self::shards_for_replica( + slot, + peer, + population, + shards_per_replica, + ) { + if !avail_shards_map.get(idx).unwrap() { + useful_shards.push(idx); + } + } + // if this peer has data shards which I don't have right now + // and I have not asked others for in this round + if !useful_shards.is_empty() { + targets_excl.insert(peer, avail_shards_map.to_vec()); + for idx in useful_shards { + avail_shards_map.set(idx, true).unwrap(); + } + } + } + } + + targets_excl + } + /// Triggers gossiping for my missing shards in committed but not-yet- /// executed instances: fetch missing shards from peers, preferring /// follower peers that hold data shards. @@ -2078,10 +2088,10 @@ impl CrosswordReplica { /// Apply a durable storage log entry for recovery. async fn recover_apply_entry( &mut self, - entry: LogEntry, + entry: WalEntry, ) -> Result<(), SummersetError> { match entry { - LogEntry::PrepareBal { slot, ballot } => { + WalEntry::PrepareBal { slot, ballot } => { if slot < self.start_slot { return Ok(()); // ignore if slot index outdated } @@ -2103,7 +2113,7 @@ impl CrosswordReplica { self.bal_prepared = 0; } - LogEntry::AcceptData { + WalEntry::AcceptData { slot, ballot, reqs_cw, @@ -2136,7 +2146,7 @@ impl CrosswordReplica { assert!(self.bal_prepared <= self.bal_prep_sent); } - LogEntry::CommitSlot { slot } => { + WalEntry::CommitSlot { slot } => { if slot < self.start_slot { return Ok(()); // ignore if slot index outdated } @@ -2185,15 +2195,15 @@ impl CrosswordReplica { Ok(()) } - /// Recover state from durable storage log. - async fn recover_from_log(&mut self) -> Result<(), SummersetError> { - assert_eq!(self.log_offset, 0); + /// Recover state from durable storage WAL log. + async fn recover_from_wal(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.wal_offset, 0); loop { // using 0 as a special log action ID self.storage_hub.submit_action( 0, LogAction::Read { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -2205,7 +2215,7 @@ impl CrosswordReplica { } => { self.recover_apply_entry(entry).await?; // update log offset - self.log_offset = end_offset; + self.wal_offset = end_offset; } LogResult::Read { entry: None, .. 
} => { // end of log reached @@ -2221,7 +2231,7 @@ impl CrosswordReplica { self.storage_hub.submit_action( 0, LogAction::Truncate { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -2229,7 +2239,7 @@ impl CrosswordReplica { offset_ok: true, .. } = log_result { - if self.log_offset > 0 { + if self.wal_offset > 0 { pf_info!(self.id; "recovered from wal log: commit {} exec {}", self.commit_bar, self.exec_bar); } @@ -2242,7 +2252,7 @@ impl CrosswordReplica { // CrosswordReplica snapshotting & GC logic impl CrosswordReplica { - /// Dump a new key-value pair to snapshot file. + /// Dump new key-value pairs to snapshot file. async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { // collect all key-value pairs put up to exec_bar let mut pairs = HashMap::new(); @@ -2283,9 +2293,9 @@ impl CrosswordReplica { /// Discard everything older than start_slot in durable WAL log. async fn snapshot_discard_log(&mut self) -> Result<(), SummersetError> { let cut_offset = if !self.insts.is_empty() { - self.insts[0].log_offset + self.insts[0].wal_offset } else { - self.log_offset + self.wal_offset }; // discard the log before cut_offset @@ -2304,8 +2314,8 @@ impl CrosswordReplica { now_size, } = log_result { - assert_eq!(self.log_offset - cut_offset, now_size); - self.log_offset = now_size; + assert_eq!(self.wal_offset - cut_offset, now_size); + self.wal_offset = now_size; } else { return logged_err!( self.id; @@ -2317,11 +2327,11 @@ impl CrosswordReplica { } } - // update inst.log_offset for all remaining in-mem instances + // update inst.wal_offset for all remaining in-mem instances for inst in &mut self.insts { - if inst.log_offset > 0 { - assert!(inst.log_offset >= cut_offset); - inst.log_offset -= cut_offset; + if inst.wal_offset > 0 { + assert!(inst.wal_offset >= cut_offset); + inst.wal_offset -= cut_offset; } } @@ -2334,6 +2344,12 @@ impl CrosswordReplica { /// NOTE: the current implementation does not guard against crashes in the /// middle of taking a snapshot. Production quality implementations should /// make the snapshotting action "atomic". + /// + /// NOTE: the current implementation does not take care of InstallSnapshot + /// messages (which is needed when some lagging follower has some slot + /// which all other peers have snapshotted); we assume here that failed + /// Accept messages will be retried indefinitely until success before its + /// associated data gets discarded from leader's memory. 
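Beyond the renames, the `snapshot_discard_log` hunk above garbage-collects the WAL after a snapshot: the log is cut at the first remaining instance's `wal_offset` (or at the current end if none remain), and every surviving offset is rebased by the discarded amount. A standalone, simplified sketch of that rebasing:

```rust
// Standalone sketch (not the crate's code) of the WAL garbage-collection
// bookkeeping: drop the prefix before the first live instance and rebase
// all remaining offsets by the amount discarded.
struct Inst {
    wal_offset: usize, // offset of this instance's first WAL record
}

fn discard_wal_prefix(insts: &mut [Inst], wal_offset: &mut usize) -> usize {
    // cut at the first remaining instance, or at the end if none remain
    let cut_offset = insts.first().map_or(*wal_offset, |i| i.wal_offset);
    // (the storage hub would physically discard bytes [0, cut_offset) here)
    *wal_offset -= cut_offset;
    for inst in insts.iter_mut() {
        if inst.wal_offset > 0 {
            assert!(inst.wal_offset >= cut_offset);
            inst.wal_offset -= cut_offset;
        }
    }
    cut_offset
}

fn main() {
    let mut insts = vec![Inst { wal_offset: 400 }, Inst { wal_offset: 520 }];
    let mut wal_offset = 700;
    let cut = discard_wal_prefix(&mut insts, &mut wal_offset);
    assert_eq!((cut, wal_offset), (400, 300));
    assert_eq!(insts[1].wal_offset, 120);
    println!("discarded a {cut}-byte WAL prefix");
}
```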
async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { pf_debug!(self.id; "taking new snapshot: start {} exec {}", self.start_slot, self.exec_bar); @@ -2512,7 +2528,7 @@ impl GenericReplica for CrosswordReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigCrossword; - batch_interval_us, max_batch_size, + batch_interval_ms, max_batch_size, backer_path, logger_sync, hb_hear_timeout_min, hb_hear_timeout_max, hb_send_interval_ms, @@ -2522,11 +2538,11 @@ impl GenericReplica for CrosswordReplica { shards_per_replica, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; - if config.batch_interval_us == 0 { + if config.batch_interval_ms == 0 { return logged_err!( id; - "invalid config.batch_interval_us '{}'", - config.batch_interval_us + "invalid config.batch_interval_ms '{}'", + config.batch_interval_ms ); } if config.hb_hear_timeout_min < 100 { @@ -2640,7 +2656,7 @@ impl GenericReplica for CrosswordReplica { let external_api = ExternalApi::new_and_setup( id, api_addr, - Duration::from_micros(config.batch_interval_us), + Duration::from_millis(config.batch_interval_ms), config.max_batch_size, ) .await?; @@ -2690,7 +2706,7 @@ impl GenericReplica for CrosswordReplica { bal_max_seen: 0, commit_bar: 0, exec_bar: 0, - log_offset: 0, + wal_offset: 0, snap_offset: 0, rs_coder, }) @@ -2703,8 +2719,8 @@ impl GenericReplica for CrosswordReplica { // recover state from durable snapshot file self.recover_from_snapshot().await?; - // recover the tail-piece memory log & state from durable storage log - self.recover_from_log().await?; + // recover the tail-piece memory log & state from durable WAL log + self.recover_from_wal().await?; // kick off leader activity hearing timer self.kickoff_hb_hear_timer()?; diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index aa42d2d1..fef9bf60 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -35,8 +35,8 @@ use tokio::sync::watch; /// Configuration parameters struct. #[derive(Debug, Deserialize)] pub struct ReplicaConfigMultiPaxos { - /// Client request batching interval in microsecs. - pub batch_interval_us: u64, + /// Client request batching interval in millisecs. + pub batch_interval_ms: u64, /// Client request batching maximum batch size. pub max_batch_size: usize, @@ -73,7 +73,7 @@ pub struct ReplicaConfigMultiPaxos { impl Default for ReplicaConfigMultiPaxos { fn default() -> Self { ReplicaConfigMultiPaxos { - batch_interval_us: 1000, + batch_interval_ms: 10, max_batch_size: 5000, backer_path: "/tmp/summerset.multipaxos.wal".into(), logger_sync: false, @@ -153,12 +153,12 @@ struct Instance { external: bool, /// Offset of first durable WAL log entry related to this instance. - log_offset: usize, + wal_offset: usize, } -/// Stable storage log entry type. +/// Stable storage WAL log entry type. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] -enum LogEntry { +enum WalEntry { /// Records an update to the largest prepare ballot seen. PrepareBal { slot: usize, ballot: Ballot }, @@ -174,6 +174,10 @@ enum LogEntry { } /// Snapshot file entry type. +/// +/// NOTE: the current implementation simply appends a squashed log at the +/// end of the snapshot file for simplicity. In production, the snapshot +/// file should be a bounded-sized backend, e.g., an LSM-tree. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum SnapEntry { /// Necessary slot indices to remember. 
@@ -193,7 +197,13 @@ enum SnapEntry { #[derive(Debug, Clone, Serialize, Deserialize, GetSize)] enum PeerMsg { /// Prepare message from leader to replicas. - Prepare { slot: usize, ballot: Ballot }, + Prepare { + /// Slot index in Prepare message is the triggering slot of this + /// Prepare. Once prepared, it means that all slots in the range + /// [slot, +infinity) are prepared under this ballot number. + slot: usize, + ballot: Ballot, + }, /// Prepare reply from replica to leader. PrepareReply { @@ -251,7 +261,7 @@ pub struct MultiPaxosReplica { state_machine: StateMachine, /// StorageHub module. - storage_hub: StorageHub, + storage_hub: StorageHub, /// StorageHub module for the snapshot file. snapshot_hub: StorageHub, @@ -300,8 +310,8 @@ pub struct MultiPaxosReplica { /// It is always true that exec_bar <= commit_bar <= start_slot + insts.len() exec_bar: usize, - /// Current durable log file offset. - log_offset: usize, + /// Current durable WAL log file offset. + wal_offset: usize, /// Current durable snapshot file offset. snap_offset: usize, @@ -326,7 +336,7 @@ impl MultiPaxosReplica { leader_bk: None, replica_bk: None, external: false, - log_offset: 0, + wal_offset: 0, } } @@ -470,7 +480,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { + entry: WalEntry::PrepareBal { slot, ballot: self.bal_prep_sent, }, @@ -503,7 +513,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { + entry: WalEntry::AcceptData { slot, ballot: inst.bal, reqs: req_batch.clone(), @@ -660,7 +670,7 @@ impl MultiPaxosReplica { fn handle_log_result( &mut self, action_id: LogActionId, - log_result: LogResult, + log_result: LogResult, ) -> Result<(), SummersetError> { let (slot, entry_type) = Self::split_log_action_id(action_id); if slot < self.start_slot { @@ -669,15 +679,15 @@ impl MultiPaxosReplica { assert!(slot < self.start_slot + self.insts.len()); if let LogResult::Append { now_size } = log_result { - assert!(now_size >= self.log_offset); - // update first log_offset of slot + assert!(now_size >= self.wal_offset); + // update first wal_offset of slot let inst = &mut self.insts[slot - self.start_slot]; - if inst.log_offset == 0 || inst.log_offset > self.log_offset { - inst.log_offset = self.log_offset; + if inst.wal_offset == 0 || inst.wal_offset > self.wal_offset { + inst.wal_offset = self.wal_offset; } - assert!(inst.log_offset <= self.log_offset); - // then update self.log_offset - self.log_offset = now_size; + assert!(inst.wal_offset <= self.wal_offset); + // then update self.wal_offset + self.wal_offset = now_size; } else { return logged_err!(self.id; "unexpected log result type: {:?}", log_result); } @@ -728,7 +738,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { slot, ballot }, + entry: WalEntry::PrepareBal { slot, ballot }, sync: self.config.logger_sync, }, )?; @@ -799,7 +809,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { + entry: WalEntry::AcceptData { slot, ballot, reqs: inst.reqs.clone(), @@ -863,7 +873,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: 
LogEntry::AcceptData { slot, ballot, reqs }, + entry: WalEntry::AcceptData { slot, ballot, reqs }, sync: self.config.logger_sync, }, )?; @@ -921,7 +931,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Committed), LogAction::Append { - entry: LogEntry::CommitSlot { slot }, + entry: WalEntry::CommitSlot { slot }, sync: self.config.logger_sync, }, )?; @@ -970,7 +980,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Committed), LogAction::Append { - entry: LogEntry::CommitSlot { slot }, + entry: WalEntry::CommitSlot { slot }, sync: self.config.logger_sync, }, )?; @@ -1123,7 +1133,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { + entry: WalEntry::PrepareBal { slot, ballot: self.bal_prep_sent, }, @@ -1381,10 +1391,10 @@ impl MultiPaxosReplica { /// Apply a durable storage log entry for recovery. async fn recover_apply_entry( &mut self, - entry: LogEntry, + entry: WalEntry, ) -> Result<(), SummersetError> { match entry { - LogEntry::PrepareBal { slot, ballot } => { + WalEntry::PrepareBal { slot, ballot } => { if slot < self.start_slot { return Ok(()); // ignore if slot index outdated } @@ -1406,7 +1416,7 @@ impl MultiPaxosReplica { self.bal_prepared = 0; } - LogEntry::AcceptData { slot, ballot, reqs } => { + WalEntry::AcceptData { slot, ballot, reqs } => { if slot < self.start_slot { return Ok(()); // ignore if slot index outdated } @@ -1435,7 +1445,7 @@ impl MultiPaxosReplica { assert!(self.bal_prepared <= self.bal_prep_sent); } - LogEntry::CommitSlot { slot } => { + WalEntry::CommitSlot { slot } => { if slot < self.start_slot { return Ok(()); // ignore if slot index outdated } @@ -1472,15 +1482,15 @@ impl MultiPaxosReplica { Ok(()) } - /// Recover state from durable storage log. - async fn recover_from_log(&mut self) -> Result<(), SummersetError> { - assert_eq!(self.log_offset, 0); + /// Recover state from durable storage WAL log. + async fn recover_from_wal(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.wal_offset, 0); loop { // using 0 as a special log action ID self.storage_hub.submit_action( 0, LogAction::Read { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -1492,7 +1502,7 @@ impl MultiPaxosReplica { } => { self.recover_apply_entry(entry).await?; // update log offset - self.log_offset = end_offset; + self.wal_offset = end_offset; } LogResult::Read { entry: None, .. } => { // end of log reached @@ -1508,7 +1518,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( 0, LogAction::Truncate { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -1516,7 +1526,7 @@ impl MultiPaxosReplica { offset_ok: true, .. } = log_result { - if self.log_offset > 0 { + if self.wal_offset > 0 { pf_info!(self.id; "recovered from wal log: commit {} exec {}", self.commit_bar, self.exec_bar); } @@ -1529,7 +1539,7 @@ impl MultiPaxosReplica { // MultiPaxosReplica snapshotting & GC logic impl MultiPaxosReplica { - /// Dump a new key-value pair to snapshot file. + /// Dump new key-value pairs to snapshot file. 
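Recovery in these hunks replays the renamed `WalEntry` records in order: `PrepareBal` raises the highest ballot seen for a slot, `AcceptData` records a tentatively accepted value under its ballot, and `CommitSlot` marks the slot committed so it can later be executed. A heavily simplified standalone sketch of that replay direction (the types below are illustrative stand-ins, not the crate's):

```rust
// Standalone sketch (not the crate's code) replaying simplified WAL entries
// into per-slot state, mirroring the PrepareBal / AcceptData / CommitSlot
// handling of `recover_apply_entry` above.
type Ballot = u64;

enum WalEntry {
    PrepareBal { slot: usize, ballot: Ballot },
    AcceptData { slot: usize, ballot: Ballot, value: String },
    CommitSlot { slot: usize },
}

#[derive(Default)]
struct Slot {
    bal: Ballot,           // highest ballot seen for this slot
    value: Option<String>, // tentatively accepted value, if any
    committed: bool,       // whether a CommitSlot record was seen
}

fn replay(log: &[WalEntry]) -> Vec<Slot> {
    let mut slots: Vec<Slot> = Vec::new();
    let ensure = |slots: &mut Vec<Slot>, s: usize| {
        while slots.len() <= s {
            slots.push(Slot::default());
        }
    };
    for entry in log {
        match entry {
            WalEntry::PrepareBal { slot, ballot } => {
                ensure(&mut slots, *slot);
                let s = &mut slots[*slot];
                s.bal = s.bal.max(*ballot);
            }
            WalEntry::AcceptData { slot, ballot, value } => {
                ensure(&mut slots, *slot);
                let s = &mut slots[*slot];
                s.bal = s.bal.max(*ballot);
                s.value = Some(value.clone());
            }
            WalEntry::CommitSlot { slot } => {
                ensure(&mut slots, *slot);
                slots[*slot].committed = true;
            }
        }
    }
    slots
}

fn main() {
    let log = vec![
        WalEntry::PrepareBal { slot: 0, ballot: 1 },
        WalEntry::AcceptData { slot: 0, ballot: 1, value: "Put x=1".into() },
        WalEntry::CommitSlot { slot: 0 },
    ];
    let slots = replay(&log);
    assert!(slots[0].committed && slots[0].value.is_some());
    println!("replayed {} slot(s) from the WAL", slots.len());
}
```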
async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { // collect all key-value pairs put up to exec_bar let mut pairs = HashMap::new(); @@ -1569,9 +1579,9 @@ impl MultiPaxosReplica { /// Discard everything older than start_slot in durable WAL log. async fn snapshot_discard_log(&mut self) -> Result<(), SummersetError> { let cut_offset = if !self.insts.is_empty() { - self.insts[0].log_offset + self.insts[0].wal_offset } else { - self.log_offset + self.wal_offset }; // discard the log before cut_offset @@ -1590,8 +1600,8 @@ impl MultiPaxosReplica { now_size, } = log_result { - assert_eq!(self.log_offset - cut_offset, now_size); - self.log_offset = now_size; + assert_eq!(self.wal_offset - cut_offset, now_size); + self.wal_offset = now_size; } else { return logged_err!( self.id; @@ -1603,11 +1613,11 @@ impl MultiPaxosReplica { } } - // update inst.log_offset for all remaining in-mem instances + // update inst.wal_offset for all remaining in-mem instances for inst in &mut self.insts { - if inst.log_offset > 0 { - assert!(inst.log_offset >= cut_offset); - inst.log_offset -= cut_offset; + if inst.wal_offset > 0 { + assert!(inst.wal_offset >= cut_offset); + inst.wal_offset -= cut_offset; } } @@ -1620,6 +1630,12 @@ impl MultiPaxosReplica { /// NOTE: the current implementation does not guard against crashes in the /// middle of taking a snapshot. Production quality implementations should /// make the snapshotting action "atomic". + /// + /// NOTE: the current implementation does not take care of InstallSnapshot + /// messages (which is needed when some lagging follower has some slot + /// which all other peers have snapshotted); we assume here that failed + /// Accept messages will be retried indefinitely until success before its + /// associated data gets discarded from leader's memory. 
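`snapshot_dump_kv_pairs` above squashes the executed prefix of the log into the snapshot: it walks slots up to `exec_bar` and keeps only the last `Put` seen for each key, so the snapshot stores final values rather than full history. A standalone sketch of that squashing:

```rust
// Standalone sketch (not the crate's code) of the key-value squashing: walk
// executed slots in order and keep the last Put per key, so the snapshot
// holds only the final value of each key.
use std::collections::HashMap;

fn squash_puts(executed: &[(String, String)]) -> HashMap<String, String> {
    let mut pairs = HashMap::new();
    for (key, value) in executed {
        pairs.insert(key.clone(), value.clone()); // later slots overwrite earlier ones
    }
    pairs
}

fn main() {
    let executed = vec![
        ("x".into(), "1".into()),
        ("y".into(), "2".into()),
        ("x".into(), "3".into()),
    ];
    let pairs = squash_puts(&executed);
    assert_eq!(pairs["x"], "3");
    assert_eq!(pairs.len(), 2);
    println!("{} keys squashed into the snapshot", pairs.len());
}
```

Replaying the snapshot at recovery then amounts to re-inserting these pairs into the state machine before the remaining WAL is applied.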
async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { pf_debug!(self.id; "taking new snapshot: start {} exec {}", self.start_slot, self.exec_bar); @@ -1798,18 +1814,18 @@ impl GenericReplica for MultiPaxosReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigMultiPaxos; - batch_interval_us, max_batch_size, + batch_interval_ms, max_batch_size, backer_path, logger_sync, hb_hear_timeout_min, hb_hear_timeout_max, hb_send_interval_ms, snapshot_path, snapshot_interval_s, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; - if config.batch_interval_us == 0 { + if config.batch_interval_ms == 0 { return logged_err!( id; - "invalid config.batch_interval_us '{}'", - config.batch_interval_us + "invalid config.batch_interval_ms '{}'", + config.batch_interval_ms ); } if config.hb_hear_timeout_min < 100 { @@ -1898,7 +1914,7 @@ impl GenericReplica for MultiPaxosReplica { let external_api = ExternalApi::new_and_setup( id, api_addr, - Duration::from_micros(config.batch_interval_us), + Duration::from_millis(config.batch_interval_ms), config.max_batch_size, ) .await?; @@ -1946,7 +1962,7 @@ impl GenericReplica for MultiPaxosReplica { bal_max_seen: 0, commit_bar: 0, exec_bar: 0, - log_offset: 0, + wal_offset: 0, snap_offset: 0, }) } @@ -1958,8 +1974,8 @@ impl GenericReplica for MultiPaxosReplica { // recover state from durable snapshot file self.recover_from_snapshot().await?; - // recover the tail-piece memory log & state from durable storage log - self.recover_from_log().await?; + // recover the tail-piece memory log & state from durable WAL log + self.recover_from_wal().await?; // kick off leader activity hearing timer self.kickoff_hb_hear_timer()?; diff --git a/src/protocols/raft.rs b/src/protocols/raft.rs index f4a60f47..4d6b2408 100644 --- a/src/protocols/raft.rs +++ b/src/protocols/raft.rs @@ -1,16 +1,40 @@ //! Replication protocol: Raft. //! -//! References: +//! ATC '14 version of Raft. References: //! - //! - +//! - + +use std::collections::HashMap; +use std::path::Path; +use std::net::SocketAddr; use crate::utils::{SummersetError, Bitmap, Timer}; +use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; +use crate::server::{ + ReplicaId, ControlHub, StateMachine, Command, CommandResult, CommandId, + ExternalApi, ApiRequest, ApiReply, StorageHub, LogAction, LogResult, + LogActionId, TransportHub, GenericReplica, +}; +use crate::client::{ClientId, ClientApiStub, ClientCtrlStub, GenericEndpoint}; +use crate::protocols::SmrProtocol; + +use rand::prelude::*; + +use async_trait::async_trait; + +use get_size::GetSize; + +use serde::{Serialize, Deserialize}; + +use tokio::time::{self, Duration, Interval, MissedTickBehavior}; +use tokio::sync::watch; /// Configuration parameters struct. #[derive(Debug, Deserialize)] pub struct ReplicaConfigRaft { - /// Client request batching interval in microsecs. - pub batch_interval_us: u64, + /// Client request batching interval in millisecs. + pub batch_interval_ms: u64, /// Client request batching maximum batch size. pub max_batch_size: usize, @@ -21,6 +45,21 @@ pub struct ReplicaConfigRaft { /// Whether to call `fsync()`/`fdatasync()` on logger. pub logger_sync: bool, + /// Min timeout of not hearing any heartbeat from leader in millisecs. + pub hb_hear_timeout_min: u64, + /// Max timeout of not hearing any heartbeat from leader in millisecs. + pub hb_hear_timeout_max: u64, + + /// Interval of leader sending AppendEntries heartbeats to followers. 
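The `hb_hear_timeout_min`/`hb_hear_timeout_max` knobs above presumably bound a randomized leader-silence timeout, as in standard Raft, so that followers do not all start campaigning in lockstep after losing the leader. A standalone sketch assuming the `rand` 0.8 API that this patch already imports:

```rust
// Standalone sketch (not the crate's code): pick a fresh random timeout in
// [hb_hear_timeout_min, hb_hear_timeout_max] each time the "have I heard
// from the leader?" timer is re-armed.
use rand::Rng;
use std::time::Duration;

fn next_hb_hear_timeout(min_ms: u64, max_ms: u64) -> Duration {
    let ms = rand::thread_rng().gen_range(min_ms..=max_ms);
    Duration::from_millis(ms)
}

fn main() {
    let timeout = next_hb_hear_timeout(600, 900); // the defaults in this patch
    let ms = timeout.as_millis() as u64;
    assert!(ms >= 600 && ms <= 900);
    println!("next election timeout: {timeout:?}");
}
```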
+ pub hb_send_interval_ms: u64, + + /// Path to snapshot file. + pub snapshot_path: String, + + /// Snapshot self-triggering interval in secs. 0 means never trigger + /// snapshotting autonomously. + pub snapshot_interval_s: u64, + // Performance simulation params (all zeros means no perf simulation): pub perf_storage_a: u64, pub perf_storage_b: u64, @@ -32,10 +71,15 @@ pub struct ReplicaConfigRaft { impl Default for ReplicaConfigRaft { fn default() -> Self { ReplicaConfigRaft { - batch_interval_us: 1000, + batch_interval_ms: 10, max_batch_size: 5000, backer_path: "/tmp/summerset.raft.wal".into(), logger_sync: false, + hb_hear_timeout_min: 600, + hb_hear_timeout_max: 900, + hb_send_interval_ms: 50, + snapshot_path: "/tmp/summerset.multipaxos.snap".into(), + snapshot_interval_s: 0, perf_storage_a: 0, perf_storage_b: 0, perf_network_a: 0, @@ -44,6 +88,85 @@ impl Default for ReplicaConfigRaft { } } +/// Term number type, defined for better code readability. +type Term = u64; + +/// Request batch type (i.e., the "command" in an entry). +/// +/// NOTE: the originally presented Raft algorithm does not explicitly mention +/// batching, but instead hides it with the heartbeats: every AppendEntries RPC +/// from the leader basically batches all commands it has received since the +/// last sent heartbeat. Here, to make this implementation more comparable to +/// MultiPaxos, we trigger batching also explicitly. +type ReqBatch = Vec<(ClientId, ApiRequest)>; + +/// In-mem + persistent entry of log, containing a term and a commands batch. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] +struct LogEntry { + /// Term number. + term: Term, + + /// Batch of client requests. + reqs: ReqBatch, +} + +/// Stable storage log entry type. +/// +/// NOTE: Raft makes the persistent log exactly mirror the in-memory log, so +/// the backer file is not a WAL log in runtime operation; it might get +/// overwritten, etc. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] +enum DurEntry { + /// Durable metadata. + Metadata { + curr_term: Term, + voted_for: ReplicaId, + }, + + /// Log entry mirroring in-mem log. + LogEntry { entry: LogEntry }, +} + +/// Snapshot file entry type. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] +enum SnapEntry { + /// Necessary slot indices to remember. + SlotInfo { + /// First entry at the start of file: number of log entries covered + /// by this snapshot file == the start slot index of remaining log. + start_slot: usize, + }, + + /// Set of key-value pairs to apply to the state. + KVPairSet { pairs: HashMap }, +} + +/// Peer-peer message type. +#[derive(Debug, Clone, Serialize, Deserialize, GetSize)] +enum PeerMsg { + /// AppendEntries from leader to followers. + AppendEntries { + term: Term, + prev_slot: usize, + prev_term: Term, + entries: Vec, + leader_commit: usize, + }, + + /// AppendEntries reply from follower to leader. + AppendEntriesReply { term: Term, success: bool }, + + /// RequestVote from leader to followers. + RequestVote { + term: Term, + last_slot: usize, + last_term: Term, + }, + + /// RequestVote reply from follower to leader. + RequestVoteReply { term: Term, granted: bool }, +} + /// Raft server replica module. pub struct RaftReplica { /// Replica ID in cluster. @@ -57,6 +180,73 @@ pub struct RaftReplica { /// Configuration parameters struct. config: ReplicaConfigRaft, + + /// Address string for client requests API. 
+ _api_addr: SocketAddr, + + /// Address string for internal peer-peer communication. + _p2p_addr: SocketAddr, + + /// ControlHub module. + control_hub: ControlHub, + + /// ExternalApi module. + external_api: ExternalApi, + + /// StateMachine module. + state_machine: StateMachine, + + /// StorageHub module. + storage_hub: StorageHub, + + /// StorageHub module for the snapshot file. + snapshot_hub: StorageHub, + + /// TransportHub module. + transport_hub: TransportHub, + + /// Who do I think is the effective leader of the cluster right now? + leader: Option, + + /// Timer for hearing heartbeat from leader. + hb_hear_timer: Timer, + + /// Interval for sending heartbeat to followers. + hb_send_interval: Interval, + + /// Heartbeat reply counters for approximate detection of follower health. + /// Tuple of (#hb_replied, #hb_replied seen at last send, repetition). + hb_reply_cnts: HashMap, + + /// Approximate health status tracking of peer replicas. + peer_alive: Bitmap, + + /// Latest term seen. + curr_term: Term, + + /// Candidate ID that received vote in current term. + voted_for: ReplicaId, + + /// In-memory log of entries. + log: Vec, + + /// Map from in-mem log entry slot index -> offset in durable backer file. + log_offset: Vec, + + /// Slot index of highest log entry known to be committed. + commit_bar: usize, + + /// Slot index of highest log entry applied to state machine. + exec_bar: usize, + + /// For each server, index of the next log entry to send. + next_slot: HashMap, + + /// For each server, index of the highest log entry known to be replicated. + match_slot: HashMap, + + /// Current durable snapshot file offset. + snap_offset: usize, } #[async_trait] diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index af46cc69..a14af95b 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -28,8 +28,8 @@ use tokio::sync::watch; /// Configuration parameters struct. #[derive(Debug, Deserialize)] pub struct ReplicaConfigRepNothing { - /// Client request batching interval in microsecs. - pub batch_interval_us: u64, + /// Client request batching interval in millisecs. + pub batch_interval_ms: u64, /// Client request batching maximum batch size. pub max_batch_size: usize, @@ -49,7 +49,7 @@ pub struct ReplicaConfigRepNothing { impl Default for ReplicaConfigRepNothing { fn default() -> Self { ReplicaConfigRepNothing { - batch_interval_us: 1000, + batch_interval_ms: 10, max_batch_size: 5000, backer_path: "/tmp/summerset.rep_nothing.wal".into(), logger_sync: false, @@ -59,9 +59,9 @@ impl Default for ReplicaConfigRepNothing { } } -/// Log entry type. +/// WAL log entry type. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] -struct LogEntry { +struct WalEntry { reqs: Vec<(ClientId, ApiRequest)>, } @@ -97,13 +97,13 @@ pub struct RepNothingReplica { state_machine: StateMachine, /// StorageHub module. - storage_hub: StorageHub, + storage_hub: StorageHub, /// In-memory log of instances. insts: Vec, - /// Current durable log file offset. - log_offset: usize, + /// Current durable WAL log file offset. 
+ wal_offset: usize, } // RepNothingReplica common helpers @@ -144,11 +144,11 @@ impl RepNothingReplica { self.insts.push(inst); // submit log action to make this instance durable - let log_entry = LogEntry { reqs: req_batch }; + let wal_entry = WalEntry { reqs: req_batch }; self.storage_hub.submit_action( inst_idx as LogActionId, LogAction::Append { - entry: log_entry, + entry: wal_entry, sync: self.config.logger_sync, }, )?; @@ -163,7 +163,7 @@ impl RepNothingReplica { fn handle_log_result( &mut self, action_id: LogActionId, - log_result: LogResult, + log_result: LogResult, ) -> Result<(), SummersetError> { let inst_idx = action_id as usize; if inst_idx >= self.insts.len() { @@ -172,8 +172,8 @@ impl RepNothingReplica { match log_result { LogResult::Append { now_size } => { - assert!(now_size >= self.log_offset); - self.log_offset = now_size; + assert!(now_size >= self.wal_offset); + self.wal_offset = now_size; } _ => { return logged_err!(self.id; "unexpected log result type for {}: {:?}", inst_idx, log_result); @@ -340,15 +340,15 @@ impl RepNothingReplica { // RepNothingReplica recovery from WAL log impl RepNothingReplica { - /// Recover state from durable storage log. - async fn recover_from_log(&mut self) -> Result<(), SummersetError> { - assert_eq!(self.log_offset, 0); + /// Recover state from durable storage WAL log. + async fn recover_from_wal(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.wal_offset, 0); loop { // using 0 as a special log action ID self.storage_hub.submit_action( 0, LogAction::Read { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -374,7 +374,7 @@ impl RepNothingReplica { execed: vec![true; num_reqs], }); // update log offset - self.log_offset = end_offset; + self.wal_offset = end_offset; } LogResult::Read { entry: None, .. 
} => { // end of log reached @@ -390,7 +390,7 @@ impl RepNothingReplica { self.storage_hub.submit_action( 0, LogAction::Truncate { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -419,14 +419,14 @@ impl GenericReplica for RepNothingReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigRepNothing; - batch_interval_us, max_batch_size, + batch_interval_ms, max_batch_size, backer_path, logger_sync, perf_storage_a, perf_storage_b)?; - if config.batch_interval_us == 0 { + if config.batch_interval_ms == 0 { return logged_err!( id; - "invalid config.batch_interval_us '{}'", - config.batch_interval_us + "invalid config.batch_interval_ms '{}'", + config.batch_interval_ms ); } @@ -460,7 +460,7 @@ impl GenericReplica for RepNothingReplica { let external_api = ExternalApi::new_and_setup( id, api_addr, - Duration::from_micros(config.batch_interval_us), + Duration::from_millis(config.batch_interval_ms), config.max_batch_size, ) .await?; @@ -475,7 +475,7 @@ impl GenericReplica for RepNothingReplica { state_machine, storage_hub, insts: vec![], - log_offset: 0, + wal_offset: 0, }) } @@ -483,8 +483,8 @@ impl GenericReplica for RepNothingReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { - // recover state from durable storage log - self.recover_from_log().await?; + // recover state from durable storage WAL log + self.recover_from_wal().await?; // main event loop let mut paused = false; diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 750ec48f..3280baf1 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -33,8 +33,8 @@ use reed_solomon_erasure::galois_8::ReedSolomon; /// Configuration parameters struct. #[derive(Debug, Deserialize)] pub struct ReplicaConfigRSPaxos { - /// Client request batching interval in microsecs. - pub batch_interval_us: u64, + /// Client request batching interval in millisecs. + pub batch_interval_ms: u64, /// Client request batching maximum batch size. pub max_batch_size: usize, @@ -77,7 +77,7 @@ pub struct ReplicaConfigRSPaxos { impl Default for ReplicaConfigRSPaxos { fn default() -> Self { ReplicaConfigRSPaxos { - batch_interval_us: 1000, + batch_interval_ms: 10, max_batch_size: 5000, backer_path: "/tmp/summerset.rs_paxos.wal".into(), logger_sync: false, @@ -159,12 +159,12 @@ struct Instance { external: bool, /// Offset of first durable WAL log entry related to this instance. - log_offset: usize, + wal_offset: usize, } -/// Stable storage log entry type. +/// Stable storage WAL log entry type. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] -enum LogEntry { +enum WalEntry { /// Records an update to the largest prepare ballot seen. PrepareBal { slot: usize, ballot: Ballot }, @@ -180,6 +180,10 @@ enum LogEntry { } /// Snapshot file entry type. +/// +/// NOTE: the current implementation simply appends a squashed log at the +/// end of the snapshot file for simplicity. In production, the snapshot +/// file should be a bounded-sized backend, e.g., an LSM-tree. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum SnapEntry { /// Necessary slot indices to remember. @@ -199,7 +203,13 @@ enum SnapEntry { #[derive(Debug, Clone, Serialize, Deserialize, GetSize)] enum PeerMsg { /// Prepare message from leader to replicas. - Prepare { slot: usize, ballot: Ballot }, + Prepare { + /// Slot index in Prepare message is the triggering slot of this + /// Prepare. 
Once prepared, it means that all slots in the range + /// [slot, +infinity) are prepared under this ballot number. + slot: usize, + ballot: Ballot, + }, /// Prepare reply from replica to leader. PrepareReply { @@ -266,7 +276,7 @@ pub struct RSPaxosReplica { state_machine: StateMachine, /// StorageHub module. - storage_hub: StorageHub, + storage_hub: StorageHub, /// StorageHub module for the snapshot file. snapshot_hub: StorageHub, @@ -315,8 +325,8 @@ pub struct RSPaxosReplica { /// It is always true that exec_bar <= commit_bar <= start_slot + insts.len() exec_bar: usize, - /// Current durable log file offset. - log_offset: usize, + /// Current durable WAL log file offset. + wal_offset: usize, /// Current durable snapshot file offset. snap_offset: usize, @@ -353,7 +363,7 @@ impl RSPaxosReplica { leader_bk: None, replica_bk: None, external: false, - log_offset: 0, + wal_offset: 0, }) } @@ -505,7 +515,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { + entry: WalEntry::PrepareBal { slot, ballot: self.bal_prep_sent, }, @@ -542,7 +552,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { + entry: WalEntry::AcceptData { slot, ballot: inst.bal, // persist only one shard on myself @@ -719,7 +729,7 @@ impl RSPaxosReplica { fn handle_log_result( &mut self, action_id: LogActionId, - log_result: LogResult, + log_result: LogResult, ) -> Result<(), SummersetError> { let (slot, entry_type) = Self::split_log_action_id(action_id); if slot < self.start_slot { @@ -728,15 +738,15 @@ impl RSPaxosReplica { assert!(slot < self.start_slot + self.insts.len()); if let LogResult::Append { now_size } = log_result { - assert!(now_size >= self.log_offset); - // update first log_offset of slot + assert!(now_size >= self.wal_offset); + // update first wal_offset of slot let inst = &mut self.insts[slot - self.start_slot]; - if inst.log_offset == 0 || inst.log_offset > self.log_offset { - inst.log_offset = self.log_offset; + if inst.wal_offset == 0 || inst.wal_offset > self.wal_offset { + inst.wal_offset = self.wal_offset; } - assert!(inst.log_offset <= self.log_offset); - // then update self.log_offset - self.log_offset = now_size; + assert!(inst.wal_offset <= self.wal_offset); + // then update self.wal_offset + self.wal_offset = now_size; } else { return logged_err!(self.id; "unexpected log result type: {:?}", log_result); } @@ -787,7 +797,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { slot, ballot }, + entry: WalEntry::PrepareBal { slot, ballot }, sync: self.config.logger_sync, }, )?; @@ -884,7 +894,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { + entry: WalEntry::AcceptData { slot, ballot, reqs_cw: subset_copy, @@ -956,7 +966,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { + entry: WalEntry::AcceptData { slot, ballot, reqs_cw: inst.reqs_cw.clone(), @@ -1022,7 +1032,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Committed), LogAction::Append { - entry: LogEntry::CommitSlot { slot }, + entry: WalEntry::CommitSlot { 
slot }, sync: self.config.logger_sync, }, )?; @@ -1071,7 +1081,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Committed), LogAction::Append { - entry: LogEntry::CommitSlot { slot }, + entry: WalEntry::CommitSlot { slot }, sync: self.config.logger_sync, }, )?; @@ -1345,7 +1355,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { + entry: WalEntry::PrepareBal { slot, ballot: self.bal_prep_sent, }, @@ -1619,10 +1629,10 @@ impl RSPaxosReplica { /// Apply a durable storage log entry for recovery. async fn recover_apply_entry( &mut self, - entry: LogEntry, + entry: WalEntry, ) -> Result<(), SummersetError> { match entry { - LogEntry::PrepareBal { slot, ballot } => { + WalEntry::PrepareBal { slot, ballot } => { if slot < self.start_slot { return Ok(()); // ignore if slot index outdated } @@ -1644,7 +1654,7 @@ impl RSPaxosReplica { self.bal_prepared = 0; } - LogEntry::AcceptData { + WalEntry::AcceptData { slot, ballot, reqs_cw, @@ -1677,7 +1687,7 @@ impl RSPaxosReplica { assert!(self.bal_prepared <= self.bal_prep_sent); } - LogEntry::CommitSlot { slot } => { + WalEntry::CommitSlot { slot } => { if slot < self.start_slot { return Ok(()); // ignore if slot index outdated } @@ -1726,15 +1736,15 @@ impl RSPaxosReplica { Ok(()) } - /// Recover state from durable storage log. - async fn recover_from_log(&mut self) -> Result<(), SummersetError> { - assert_eq!(self.log_offset, 0); + /// Recover state from durable storage WAL log. + async fn recover_from_wal(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.wal_offset, 0); loop { // using 0 as a special log action ID self.storage_hub.submit_action( 0, LogAction::Read { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -1746,7 +1756,7 @@ impl RSPaxosReplica { } => { self.recover_apply_entry(entry).await?; // update log offset - self.log_offset = end_offset; + self.wal_offset = end_offset; } LogResult::Read { entry: None, .. } => { // end of log reached @@ -1762,7 +1772,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( 0, LogAction::Truncate { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -1770,7 +1780,7 @@ impl RSPaxosReplica { offset_ok: true, .. } = log_result { - if self.log_offset > 0 { + if self.wal_offset > 0 { pf_info!(self.id; "recovered from wal log: commit {} exec {}", self.commit_bar, self.exec_bar); } @@ -1783,7 +1793,7 @@ impl RSPaxosReplica { // RSPaxosReplica snapshotting & GC logic impl RSPaxosReplica { - /// Dump a new key-value pair to snapshot file. + /// Dump new key-value pairs to snapshot file. async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { // collect all key-value pairs put up to exec_bar let mut pairs = HashMap::new(); @@ -1824,9 +1834,9 @@ impl RSPaxosReplica { /// Discard everything older than start_slot in durable WAL log. 
async fn snapshot_discard_log(&mut self) -> Result<(), SummersetError> { let cut_offset = if !self.insts.is_empty() { - self.insts[0].log_offset + self.insts[0].wal_offset } else { - self.log_offset + self.wal_offset }; // discard the log before cut_offset @@ -1845,8 +1855,8 @@ impl RSPaxosReplica { now_size, } = log_result { - assert_eq!(self.log_offset - cut_offset, now_size); - self.log_offset = now_size; + assert_eq!(self.wal_offset - cut_offset, now_size); + self.wal_offset = now_size; } else { return logged_err!( self.id; @@ -1858,11 +1868,11 @@ impl RSPaxosReplica { } } - // update inst.log_offset for all remaining in-mem instances + // update inst.wal_offset for all remaining in-mem instances for inst in &mut self.insts { - if inst.log_offset > 0 { - assert!(inst.log_offset >= cut_offset); - inst.log_offset -= cut_offset; + if inst.wal_offset > 0 { + assert!(inst.wal_offset >= cut_offset); + inst.wal_offset -= cut_offset; } } @@ -1875,6 +1885,12 @@ impl RSPaxosReplica { /// NOTE: the current implementation does not guard against crashes in the /// middle of taking a snapshot. Production quality implementations should /// make the snapshotting action "atomic". + /// + /// NOTE: the current implementation does not take care of InstallSnapshot + /// messages (which is needed when some lagging follower has some slot + /// which all other peers have snapshotted); we assume here that failed + /// Accept messages will be retried indefinitely until success before its + /// associated data gets discarded from leader's memory. async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { pf_debug!(self.id; "taking new snapshot: start {} exec {}", self.start_slot, self.exec_bar); @@ -2053,7 +2069,7 @@ impl GenericReplica for RSPaxosReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigRSPaxos; - batch_interval_us, max_batch_size, + batch_interval_ms, max_batch_size, backer_path, logger_sync, hb_hear_timeout_min, hb_hear_timeout_max, hb_send_interval_ms, @@ -2061,11 +2077,11 @@ impl GenericReplica for RSPaxosReplica { fault_tolerance, recon_chunk_size, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; - if config.batch_interval_us == 0 { + if config.batch_interval_ms == 0 { return logged_err!( id; - "invalid config.batch_interval_us '{}'", - config.batch_interval_us + "invalid config.batch_interval_ms '{}'", + config.batch_interval_ms ); } if config.hb_hear_timeout_min < 100 { @@ -2173,7 +2189,7 @@ impl GenericReplica for RSPaxosReplica { let external_api = ExternalApi::new_and_setup( id, api_addr, - Duration::from_micros(config.batch_interval_us), + Duration::from_millis(config.batch_interval_ms), config.max_batch_size, ) .await?; @@ -2221,7 +2237,7 @@ impl GenericReplica for RSPaxosReplica { bal_max_seen: 0, commit_bar: 0, exec_bar: 0, - log_offset: 0, + wal_offset: 0, snap_offset: 0, rs_coder, }) @@ -2234,8 +2250,8 @@ impl GenericReplica for RSPaxosReplica { // recover state from durable snapshot file self.recover_from_snapshot().await?; - // recover the tail-piece memory log & state from durable storage log - self.recover_from_log().await?; + // recover the tail-piece memory log & state from durable WAL log + self.recover_from_wal().await?; // kick off leader activity hearing timer self.kickoff_hb_hear_timer()?; diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index a0345d7e..93baeb0c 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -29,8 +29,8 @@ use 
tokio::sync::watch; /// Configuration parameters struct. #[derive(Debug, Deserialize)] pub struct ReplicaConfigSimplePush { - /// Client request batching interval in microsecs. - pub batch_interval_us: u64, + /// Client request batching interval in millisecs. + pub batch_interval_ms: u64, /// Client request batching maximum batch size. pub max_batch_size: usize, @@ -52,7 +52,7 @@ pub struct ReplicaConfigSimplePush { impl Default for ReplicaConfigSimplePush { fn default() -> Self { ReplicaConfigSimplePush { - batch_interval_us: 1000, + batch_interval_ms: 10, max_batch_size: 5000, backer_path: "/tmp/summerset.simple_push.wal".into(), rep_degree: 2, @@ -64,9 +64,9 @@ impl Default for ReplicaConfigSimplePush { } } -/// Log entry type. +/// WAL log entry type. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] -enum LogEntry { +enum WalEntry { FromClient { reqs: Vec<(ClientId, ApiRequest)>, }, @@ -126,7 +126,7 @@ pub struct SimplePushReplica { state_machine: StateMachine, /// StorageHub module. - storage_hub: StorageHub, + storage_hub: StorageHub, /// TransportHub module. transport_hub: TransportHub, @@ -134,8 +134,8 @@ pub struct SimplePushReplica { /// In-memory log of instances. insts: Vec, - /// Current durable log file offset. - log_offset: usize, + /// Current durable WAL log file offset. + wal_offset: usize, } // SimplePushReplica common helpers @@ -192,13 +192,13 @@ impl SimplePushReplica { self.insts.push(inst); // submit log action to make this instance durable - let log_entry = LogEntry::FromClient { + let wal_entry = WalEntry::FromClient { reqs: req_batch.clone(), }; self.storage_hub.submit_action( inst_idx as LogActionId, LogAction::Append { - entry: log_entry, + entry: wal_entry, sync: true, }, )?; @@ -222,7 +222,7 @@ impl SimplePushReplica { fn handle_log_result( &mut self, action_id: LogActionId, - log_result: LogResult, + log_result: LogResult, ) -> Result<(), SummersetError> { let inst_idx = action_id as usize; if inst_idx >= self.insts.len() { @@ -231,8 +231,8 @@ impl SimplePushReplica { match log_result { LogResult::Append { now_size } => { - assert!(now_size >= self.log_offset); - self.log_offset = now_size; + assert!(now_size >= self.wal_offset); + self.wal_offset = now_size; } _ => { return logged_err!(self.id; "unexpected log result type for {}: {:?}", inst_idx, log_result); @@ -296,7 +296,7 @@ impl SimplePushReplica { self.insts.push(inst); // submit log action to make this instance durable - let log_entry = LogEntry::PeerPushed { + let wal_entry = WalEntry::PeerPushed { peer, src_inst_idx, reqs: req_batch.clone(), @@ -304,7 +304,7 @@ impl SimplePushReplica { self.storage_hub.submit_action( inst_idx as LogActionId, LogAction::Append { - entry: log_entry, + entry: wal_entry, sync: true, }, )?; @@ -508,15 +508,15 @@ impl SimplePushReplica { // SimplePushReplica recovery from WAL log impl SimplePushReplica { - /// Recover state from durable storage log. - async fn recover_from_log(&mut self) -> Result<(), SummersetError> { - assert_eq!(self.log_offset, 0); + /// Recover state from durable storage WAL log. 
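This recovery loop has the same shape in every protocol this patch touches: read entries starting at offset 0 until a None read, apply each one, then truncate the file to the last good offset to drop any partial tail record. A minimal standalone sketch of that shape, with a hypothetical closure-based reader standing in for the StorageHub read/truncate actions:

// Generic shape of these WAL recovery loops (hypothetical reader interface;
// the real code submits LogAction::Read / LogAction::Truncate to StorageHub):
// keep reading from the current offset, apply every complete entry, and hand
// back the final offset so the caller can truncate away any partial tail.
fn recover<E>(
    mut read_at: impl FnMut(usize) -> Option<(E, usize)>, // entry + end offset
    mut apply: impl FnMut(E),
) -> usize {
    let mut offset = 0;
    while let Some((entry, end_offset)) = read_at(offset) {
        apply(entry);
        offset = end_offset;
    }
    offset
}

fn main() {
    let file: Vec<u64> = vec![10, 20, 30]; // toy log, one entry per 8 bytes
    let end = recover(
        |off| file.get(off / 8).map(|&v| (v, off + 8)),
        |v| println!("applied entry {v}"),
    );
    assert_eq!(end, 24); // truncate the durable file to this offset
}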
+ async fn recover_from_wal(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.wal_offset, 0); loop { // using 0 as a special log action ID self.storage_hub.submit_action( 0, LogAction::Read { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -527,8 +527,8 @@ impl SimplePushReplica { end_offset, } => { let (from_peer, reqs) = match entry { - LogEntry::FromClient { reqs } => (None, reqs), - LogEntry::PeerPushed { + WalEntry::FromClient { reqs } => (None, reqs), + WalEntry::PeerPushed { peer, src_inst_idx, reqs, @@ -552,7 +552,7 @@ impl SimplePushReplica { from_peer, }); // update log offset - self.log_offset = end_offset; + self.wal_offset = end_offset; } LogResult::Read { entry: None, .. } => { // end of log reached @@ -568,7 +568,7 @@ impl SimplePushReplica { self.storage_hub.submit_action( 0, LogAction::Truncate { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -598,15 +598,15 @@ impl GenericReplica for SimplePushReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigSimplePush; - batch_interval_us, max_batch_size, + batch_interval_ms, max_batch_size, backer_path, rep_degree, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; - if config.batch_interval_us == 0 { + if config.batch_interval_ms == 0 { return logged_err!( id; - "invalid config.batch_interval_us '{}'", - config.batch_interval_us + "invalid config.batch_interval_ms '{}'", + config.batch_interval_ms ); } @@ -666,7 +666,7 @@ impl GenericReplica for SimplePushReplica { let external_api = ExternalApi::new_and_setup( id, api_addr, - Duration::from_micros(config.batch_interval_us), + Duration::from_millis(config.batch_interval_ms), config.max_batch_size, ) .await?; @@ -683,7 +683,7 @@ impl GenericReplica for SimplePushReplica { storage_hub, transport_hub, insts: vec![], - log_offset: 0, + wal_offset: 0, }) } @@ -691,8 +691,8 @@ impl GenericReplica for SimplePushReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { - // recover state from durable storage log - self.recover_from_log().await?; + // recover state from durable storage WAL log + self.recover_from_wal().await?; // main event loop let mut paused = false; diff --git a/src/server/transport.rs b/src/server/transport.rs index e3b464f4..a91c44f4 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -1,9 +1,10 @@ //! Summerset server internal TCP transport module implementation. //! -//! In concept, all messages are sent through unstable communication channels, -//! and are retried if the sender did not receive an ACK in a timely manner. -//! Here, we use TCP as the communication protocol to get the same effect of -//! "every message a sender wants to send will eventually be delivered". +//! NOTE: In concept, all messages are sent through unstable communication +//! channels, and are retried if the sender did not receive an ACK in a timely +//! manner. Here, we use TCP as the communication protocol to get the same +//! effect of "every message a sender wants to send will be retried until +//! eventually delivered". 
use std::fmt; use std::net::SocketAddr; From 7d69552d0f0ec37bbd609aa0dfa052771789bde6 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 8 Oct 2023 10:30:48 -0500 Subject: [PATCH 85/89] basically finished Raft impl --- README.md | 1 + src/protocols/crossword.rs | 6 +- src/protocols/multipaxos.rs | 6 +- src/protocols/raft.rs | 1680 ++++++++++++++++++++++++++++++++++- src/protocols/rs_paxos.rs | 6 +- 5 files changed, 1679 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index d968b5d5..89481b4a 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,7 @@ Summerset is a distributed, replicated, protocol-generic key-value store support | `SimplePush` | Pushing to peers w/o any consistency guarantees | | `MultiPaxos` | Classic [MultiPaxos](https://www.microsoft.com/en-us/research/uploads/prod/2016/12/paxos-simple-Copy.pdf) protocol | | `RS-Paxos` | MultiPaxos w/ Reed-Solomon erasure code sharding | +| `Raft` | Explicit notion of log and strong leadership | Formal TLA+ specification of some protocols are provided in `tla+/`. diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 69028ce8..71942eb5 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -2338,7 +2338,7 @@ impl CrosswordReplica { Ok(()) } - /// Take a snapshot up to current exec_idx, then discard the in-mem log up + /// Take a snapshot up to current exec_bar, then discard the in-mem log up /// to that index as well as outdate entries in the durable WAL log file. /// /// NOTE: the current implementation does not guard against crashes in the @@ -2383,7 +2383,7 @@ impl CrosswordReplica { offset_ok: true, .. } => {} _ => { - return logged_err!(self.id; "unexpected log result type or failed truncate"); + return logged_err!(self.id; "unexpected log result type or failed write"); } } @@ -2502,7 +2502,7 @@ impl CrosswordReplica { self.snap_offset = now_size; Ok(()) } else { - logged_err!(self.id; "unexpected log result type or failed truncate") + logged_err!(self.id; "unexpected log result type or failed write") } } diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index fef9bf60..a19e1d48 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -1624,7 +1624,7 @@ impl MultiPaxosReplica { Ok(()) } - /// Take a snapshot up to current exec_idx, then discard the in-mem log up + /// Take a snapshot up to current exec_bar, then discard the in-mem log up /// to that index as well as outdate entries in the durable WAL log file. /// /// NOTE: the current implementation does not guard against crashes in the @@ -1669,7 +1669,7 @@ impl MultiPaxosReplica { offset_ok: true, .. } => {} _ => { - return logged_err!(self.id; "unexpected log result type or failed truncate"); + return logged_err!(self.id; "unexpected log result type or failed write"); } } @@ -1788,7 +1788,7 @@ impl MultiPaxosReplica { self.snap_offset = now_size; Ok(()) } else { - logged_err!(self.id; "unexpected log result type or failed truncate") + logged_err!(self.id; "unexpected log result type or failed write") } } diff --git a/src/protocols/raft.rs b/src/protocols/raft.rs index 4d6b2408..4693b5cf 100644 --- a/src/protocols/raft.rs +++ b/src/protocols/raft.rs @@ -5,7 +5,7 @@ //! - //! 
- -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::path::Path; use std::net::SocketAddr; @@ -78,7 +78,7 @@ impl Default for ReplicaConfigRaft { hb_hear_timeout_min: 600, hb_hear_timeout_max: 900, hb_send_interval_ms: 50, - snapshot_path: "/tmp/summerset.multipaxos.snap".into(), + snapshot_path: "/tmp/summerset.raft.snap".into(), snapshot_interval_s: 0, perf_storage_a: 0, perf_storage_b: 0, @@ -108,6 +108,14 @@ struct LogEntry { /// Batch of client requests. reqs: ReqBatch, + + /// True if from external client, else false. + external: bool, + + /// Offset in durable log file of this entry. This field is not maintained + /// in durable storage itself, where it is typically 0. It is maintained + /// only in the in-memory log. + log_offset: usize, } /// Stable storage log entry type. @@ -120,7 +128,7 @@ enum DurEntry { /// Durable metadata. Metadata { curr_term: Term, - voted_for: ReplicaId, + voted_for: Option, }, /// Log entry mirroring in-mem log. @@ -154,7 +162,12 @@ enum PeerMsg { }, /// AppendEntries reply from follower to leader. - AppendEntriesReply { term: Term, success: bool }, + AppendEntriesReply { + term: Term, + /// For correct tracking of which AppendEntries this reply is for. + end_slot: usize, + success: bool, + }, /// RequestVote from leader to followers. RequestVote { @@ -167,6 +180,16 @@ enum PeerMsg { RequestVoteReply { term: Term, granted: bool }, } +/// Replica role type. +#[derive( + Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Serialize, Deserialize, +)] +enum Role { + Follower, + Candidate, + Leader, +} + /// Raft server replica module. pub struct RaftReplica { /// Replica ID in cluster. @@ -205,6 +228,9 @@ pub struct RaftReplica { /// TransportHub module. transport_hub: TransportHub, + /// Which role am I in right now? + role: Role, + /// Who do I think is the effective leader of the cluster right now? leader: Option, @@ -224,20 +250,26 @@ pub struct RaftReplica { /// Latest term seen. curr_term: Term, - /// Candidate ID that received vote in current term. - voted_for: ReplicaId, + /// Candidate ID that I voted for in current term. + voted_for: Option, + + /// Replica IDs that voted for me in current election. + votes_granted: HashSet, - /// In-memory log of entries. + /// In-memory log of entries. Slot 0 is a dummy entry to make indexing happy. log: Vec, - /// Map from in-mem log entry slot index -> offset in durable backer file. - log_offset: Vec, + /// Start slot index of in-mem log after latest snapshot. + start_slot: usize, + + /// Timer for taking a new autonomous snapshot. + snapshot_interval: Interval, /// Slot index of highest log entry known to be committed. - commit_bar: usize, + last_commit: usize, /// Slot index of highest log entry applied to state machine. - exec_bar: usize, + last_exec: usize, /// For each server, index of the next log entry to send. next_slot: HashMap, @@ -245,10 +277,1359 @@ pub struct RaftReplica { /// For each server, index of the highest log entry known to be replicated. match_slot: HashMap, + /// Current durable log file end offset. + log_offset: usize, + /// Current durable snapshot file offset. snap_offset: usize, } +// RaftReplica common helpers +impl RaftReplica { + /// Compose LogActionId from (slot, end_slot) pair & entry type. + /// Uses the `Role` enum type to represent differnet entry types. 
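The composed ID packs the entry type into the low 2 bits, the end slot into the next 31 bits, and the start slot into the bits above; a standalone round-trip sketch, assuming `LogActionId` is a `u64` alias as elsewhere in the codebase (`pack`/`unpack` are illustrative names):

// Round trip of the packing scheme below
// (layout: | slot (high bits) | end_slot (31 bits) | entry type (2 bits) |).
fn pack(slot: u64, slot_e: u64, type_num: u64) -> u64 {
    (slot << 33) | (slot_e << 2) | type_num
}

fn unpack(id: u64) -> (u64, u64, u64) {
    (id >> 33, (id & ((1 << 33) - 1)) >> 2, id & 0b11)
}

fn main() {
    let id = pack(5, 7, 2); // e.g. a leader append covering slots 5..=7
    assert_eq!(unpack(id), (5, 7, 2));
}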
+ #[inline] + fn make_log_action_id( + slot: usize, + slot_e: usize, + entry_type: Role, + ) -> LogActionId { + let type_num = match entry_type { + Role::Follower => 1, + Role::Leader => 2, + _ => panic!("unknown log entry type {:?}", entry_type), + }; + ((slot << 33) | (slot_e << 2) | type_num) as LogActionId + } + + /// Decompose LogActionId into (slot, end_slot) pair & entry type. + #[inline] + fn split_log_action_id(log_action_id: LogActionId) -> (usize, usize, Role) { + let slot = (log_action_id >> 33) as usize; + let slot_e = ((log_action_id & ((1 << 33) - 1)) >> 2) as usize; + let type_num = log_action_id & ((1 << 2) - 1); + let entry_type = match type_num { + 1 => Role::Follower, + 2 => Role::Leader, + _ => panic!("unknown log entry type num {}", type_num), + }; + (slot, slot_e, entry_type) + } + + /// Compose CommandId from slot index & command index within. + #[inline] + fn make_command_id(slot: usize, cmd_idx: usize) -> CommandId { + assert!(slot <= (u32::MAX as usize)); + assert!(cmd_idx <= (u32::MAX as usize)); + ((slot << 32) | cmd_idx) as CommandId + } + + /// Decompose CommandId into slot index & command index within. + #[inline] + fn split_command_id(command_id: CommandId) -> (usize, usize) { + let slot = (command_id >> 32) as usize; + let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; + (slot, cmd_idx) + } + + /// Check if the given term is larger than mine. If so, convert my role + /// back to follower. Returns true if my role was not follower but now + /// converted to follower, and false otherwise. + #[inline] + fn check_term( + &mut self, + peer: ReplicaId, + term: Term, + ) -> Result { + if term > self.curr_term { + self.curr_term = term; + self.heard_heartbeat(peer, term)?; // refresh election timer + if self.role != Role::Follower { + self.role = Role::Follower; + Ok(true) + } else { + Ok(false) + } + } else { + Ok(false) + } + } +} + +// RaftReplica client requests entrance +impl RaftReplica { + /// Handler of client request batch chan recv. + fn handle_req_batch( + &mut self, + req_batch: ReqBatch, + ) -> Result<(), SummersetError> { + let batch_size = req_batch.len(); + assert!(batch_size > 0); + pf_debug!(self.id; "got request batch of size {}", batch_size); + + // if I'm not a leader, ignore client requests + if self.role != Role::Leader { + for (client, req) in req_batch { + if let ApiRequest::Req { id: req_id, .. } = req { + // tell the client to try on known leader or just the + // next ID replica + let target = if let Some(peer) = self.leader { + peer + } else { + (self.id + 1) % self.population + }; + self.external_api.send_reply( + ApiReply::Reply { + id: req_id, + result: None, + redirect: Some(target), + }, + client, + )?; + pf_trace!(self.id; "redirected client {} to replica {}", + client, target); + } + } + return Ok(()); + } + + // append an entry to in-memory log + let entry = LogEntry { + term: self.curr_term, + reqs: req_batch, + external: true, + log_offset: self.log_offset, + }; + let slot = self.start_slot + self.log.len(); + self.log.push(entry.clone()); + + // submit logger action to make this log entry durable + self.storage_hub.submit_action( + Self::make_log_action_id(slot, slot, Role::Leader), + LogAction::Append { + entry: DurEntry::LogEntry { entry }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted leader append log action for slot {}", slot); + + Ok(()) + } +} + +// RaftReplica durable logging +impl RaftReplica { + /// Handler of leader append logging result chan recv. 
+ fn handle_logged_leader_append( + &mut self, + slot: usize, + slot_e: usize, + ) -> Result<(), SummersetError> { + if slot < self.start_slot || self.role != Role::Leader { + return Ok(()); // ignore if outdated + } + pf_trace!(self.id; "finished leader append logging for slot {} <= {}", + slot, slot_e); + assert_eq!(slot, slot_e); + + // broadcast AppendEntries messages to followers + for peer in 0..self.population { + if peer == self.id { + continue; + } + + let prev_slot = self.next_slot[&peer] - 1; + if prev_slot < self.start_slot { + pf_error!(self.id; "snapshotted slot {} queried", prev_slot); + } + let prev_term = self.log[prev_slot - self.start_slot].term; + let entries = self + .log + .iter() + .skip(self.next_slot[&peer] - self.start_slot) + .cloned() + .collect(); + + if slot >= self.next_slot[&peer] { + self.transport_hub.send_msg( + PeerMsg::AppendEntries { + term: self.curr_term, + prev_slot, + prev_term, + entries, + leader_commit: self.last_commit, + }, + peer, + )?; + pf_trace!(self.id; "sent AppendEntries -> {} with slots {} - {}", + peer, self.next_slot[&peer], + self.start_slot + self.log.len() - 1); + } + } + + // I also heard my own heartbeat + self.heard_heartbeat(self.id, self.curr_term)?; + + Ok(()) + } + + /// Handler of follower append logging result chan recv. + fn handle_logged_follower_append( + &mut self, + slot: usize, + slot_e: usize, + ) -> Result<(), SummersetError> { + if slot < self.start_slot || self.role != Role::Follower { + return Ok(()); // ignore if outdated + } + pf_trace!(self.id; "finished follower append logging for slot {} <= {}", + slot, slot_e); + assert!(slot <= slot_e); + + // submit newly committed entry for state machine execution + if slot > self.last_exec && slot <= self.last_commit { + let entry = &self.log[slot - self.start_slot]; + for (cmd_idx, (_, req)) in entry.reqs.iter().enumerate() { + if let ApiRequest::Req { cmd, .. } = req { + self.state_machine.submit_cmd( + Self::make_command_id(slot, cmd_idx), + cmd.clone(), + )?; + } else { + continue; // ignore other types of requests + } + } + } + + // if all consecutive entries are made durable, reply AppendEntries + // success back to leader + if slot == slot_e { + if let Some(leader) = self.leader { + self.transport_hub.send_msg( + PeerMsg::AppendEntriesReply { + term: self.curr_term, + end_slot: slot_e, + success: true, + }, + leader, + )?; + pf_trace!(self.id; "sent AppendEntriesReply -> {} up to slot {}", + leader, slot_e); + } + } + + Ok(()) + } + + /// Synthesized handler of durable logging result chan recv. 
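In the leader-append handler above, what gets shipped to each follower is fully determined by that follower's `next_slot`: the entry just before it supplies the consistency-check point, and everything from `next_slot` onward rides along. A toy sketch of that selection, with log entries reduced to just their terms and the snapshot `start_slot` offset ignored:

// Toy version of the per-follower AppendEntries argument selection above:
// prev_slot sits right before the follower's next_slot, and every entry from
// next_slot to the end of the log is shipped.
fn append_args(log_terms: &[u64], next_slot: usize) -> (usize, u64, Vec<u64>) {
    let prev_slot = next_slot - 1;
    (prev_slot, log_terms[prev_slot], log_terms[next_slot..].to_vec())
}

fn main() {
    let terms = [0, 1, 1, 2, 2]; // slots 0..=4; this follower's next_slot is 3
    assert_eq!(append_args(&terms, 3), (2, 1, vec![2, 2]));
}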
+ fn handle_log_result( + &mut self, + action_id: LogActionId, + log_result: LogResult, + ) -> Result<(), SummersetError> { + let (slot, slot_e, entry_type) = Self::split_log_action_id(action_id); + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } + assert!(slot_e < self.start_slot + self.log.len()); + + if let LogResult::Append { now_size } = log_result { + assert_eq!( + self.log[slot - self.start_slot].log_offset, + self.log_offset + ); + assert!(now_size > self.log_offset); + self.log_offset = now_size; + } else { + return logged_err!(self.id; "unexpected log result type: {:?}", log_result); + } + + match entry_type { + Role::Follower => self.handle_logged_follower_append(slot, slot_e), + Role::Leader => self.handle_logged_leader_append(slot, slot_e), + _ => { + logged_err!(self.id; "unexpected log entry type: {:?}", entry_type) + } + } + } +} + +// RaftReplica peer-peer messages handling +impl RaftReplica { + /// Handler of AppendEntries message from leader. + async fn handle_msg_append_entries( + &mut self, + leader: ReplicaId, + term: Term, + prev_slot: usize, + prev_term: Term, + mut entries: Vec, + leader_commit: usize, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received AcceptEntries <- {} for slot > {} term {}", + leader, prev_slot, term); + if self.check_term(leader, term)? || self.role != Role::Follower { + return Ok(()); + } + + // reply false if term smaller than mine, or if my log does not + // contain an entry at prev_slot matching prev_term + if term < self.curr_term + || prev_slot < self.start_slot + || prev_slot >= self.start_slot + self.log.len() + || self.log[prev_slot - self.start_slot].term != prev_term + { + self.transport_hub.send_msg( + PeerMsg::AppendEntriesReply { + term: self.curr_term, + end_slot: prev_slot, + success: false, + }, + leader, + )?; + + if term >= self.curr_term { + // also refresh heartbeat timer here since the "decrementing" + // procedure for a lagging follower might take long + self.heard_heartbeat(leader, term)?; + } + return Ok(()); + } + + // update my knowledge of who's the current leader, and reset election + // timeout timer + self.leader = Some(leader); + self.heard_heartbeat(leader, term)?; + + // check if any existing entry conflicts with a new one in `entries`. 
+ // If so, truncate everything at and after that entry + let mut first_new = prev_slot + 1; + for (slot, new_entry) in entries + .iter() + .enumerate() + .map(|(s, e)| (s + prev_slot + 1, e)) + { + if slot >= self.start_slot + self.log.len() { + first_new = slot; + break; + } else if self.log[slot - self.start_slot].term != new_entry.term { + let cut_offset = self.log[slot - self.start_slot].log_offset; + // do this truncation in-place for simplicity + self.storage_hub.submit_action( + 0, + LogAction::Truncate { offset: cut_offset }, + )?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id != 0 { + // normal log action previously in queue; process it + self.handle_log_result(action_id, log_result)?; + } else { + if let LogResult::Truncate { + offset_ok: true, + now_size, + } = log_result + { + assert_eq!(now_size, cut_offset); + self.log_offset = cut_offset; + } else { + return logged_err!( + self.id; + "unexpected log result type or failed truncate" + ); + } + break; + } + } + // truncate in-mem log as well + self.log.truncate(slot - self.start_slot); + first_new = slot; + break; + } + } + + // append new entries into my log, and submit logger actions to make + // new entries durable + let (num_entries, mut num_appended) = (0, 0); + for (slot, mut entry) in entries + .drain((first_new - prev_slot - 1)..entries.len()) + .enumerate() + .map(|(s, e)| (s + first_new, e)) + { + entry.external = false; // not from client + self.log.push(entry.clone()); + self.storage_hub.submit_action( + Self::make_log_action_id( + slot, + prev_slot + num_entries, + Role::Follower, + ), + LogAction::Append { + entry: DurEntry::LogEntry { entry }, + sync: self.config.logger_sync, + }, + )?; + num_appended += 1; + } + + // even if no entries appended, also send back AppendEntriesReply + // as a follower-to-leader reverse heardbeat for peer health + // tracking purposes + if num_appended == 0 { + self.transport_hub.send_msg( + PeerMsg::AppendEntriesReply { + term: self.curr_term, + end_slot: first_new - 1, + success: true, + }, + leader, + )?; + } + + // if leader_commit is larger than my last_commit, update last_commit + if leader_commit > self.last_commit { + self.last_commit = if leader_commit < prev_slot + entries.len() { + leader_commit + } else { + prev_slot + entries.len() + }; + } + + Ok(()) + } + + /// Handler of AppendEntries reply from follower. + fn handle_msg_append_entries_reply( + &mut self, + peer: ReplicaId, + term: Term, + end_slot: usize, + success: bool, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received AcceptEntriesReply <- {} for term {} {}", + peer, term, if success { "ok" } else { "fail" }); + if self.check_term(peer, term)? 
|| self.role != Role::Leader { + return Ok(()); + } + + if success { + // success: update next_slot and match_slot for follower + *self.next_slot.get_mut(&peer).unwrap() = end_slot + 1; + *self.match_slot.get_mut(&peer).unwrap() = end_slot; + + // since we updated some match_slot here, check if any additional + // entries are now considered committed + for slot in + (self.last_commit + 1)..(self.start_slot + self.log.len()) + { + let entry = &self.log[slot - self.start_slot]; + if entry.term != self.curr_term { + continue; // cannot decide commit using non-latest term + } + + let match_cnt = 1 + self + .match_slot + .values() + .filter(|&&s| s >= slot) + .count() as u8; + if match_cnt >= self.quorum_cnt { + // quorum size reached, set last_commit to here + self.last_commit = slot; + } + } + + // submit newly committed commands, if any, for execution + for slot in (self.last_exec + 1)..=self.last_commit { + let entry = &self.log[slot - self.start_slot]; + for (cmd_idx, (_, req)) in entry.reqs.iter().enumerate() { + if let ApiRequest::Req { cmd, .. } = req { + self.state_machine.submit_cmd( + Self::make_command_id(slot, cmd_idx), + cmd.clone(), + )?; + } else { + continue; // ignore other types of requests + } + } + } + } else { + // failed: decrement next_slot for follower and retry + *self.next_slot.get_mut(&peer).unwrap() -= 1; + + let prev_slot = self.next_slot[&peer] - 1; + if prev_slot < self.start_slot { + pf_error!(self.id; "snapshotted slot {} queried", prev_slot); + } + let prev_term = self.log[prev_slot - self.start_slot].term; + let entries = self + .log + .iter() + .skip(self.next_slot[&peer] - self.start_slot) + .cloned() + .collect(); + + self.transport_hub.send_msg( + PeerMsg::AppendEntries { + term: self.curr_term, + prev_slot, + prev_term, + entries, + leader_commit: self.last_commit, + }, + peer, + )?; + pf_trace!(self.id; "sent AppendEntries -> {} with slots {} - {}", + peer, self.next_slot[&peer], + self.start_slot + self.log.len() - 1); + } + + Ok(()) + } + + /// Handler of RequestVote message from candidate. + fn handle_msg_request_vote( + &mut self, + candidate: ReplicaId, + term: Term, + last_slot: usize, + last_term: Term, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received RequestVote <- {} with term {} last {} term {}", + candidate, term, last_slot, last_term); + self.check_term(candidate, term)?; + + // if the given term is smaller than mine, reply false + if term < self.curr_term { + self.transport_hub.send_msg( + PeerMsg::RequestVoteReply { + term: self.curr_term, + granted: false, + }, + candidate, + )?; + pf_trace!(self.id; "sent RequestVote -> {} term {} false", + candidate, self.curr_term); + return Ok(()); + } + + // if I did not vote for anyone else in my current term and that the + // candidate's log is as up-to-date as mine, grant vote + #[allow(clippy::collapsible_if)] + if self.voted_for.is_none() || (self.voted_for.unwrap() == candidate) { + if last_term >= self.log.last().unwrap().term + || (last_term == self.curr_term + && last_slot + 1 >= self.start_slot + self.log.len()) + { + self.transport_hub.send_msg( + PeerMsg::RequestVoteReply { + term: self.curr_term, + granted: true, + }, + candidate, + )?; + pf_trace!(self.id; "sent RequestVote -> {} term {} granted", + candidate, self.curr_term); + } + } + + Ok(()) + } + + /// Handler of RequestVote reply from peer. 
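The commit-index advance in the AppendEntriesReply handler above can be read in isolation: a slot becomes committed once the leader itself plus the followers whose `match_slot` has reached it form a majority. A minimal sketch of that rule (illustrative names; the current-term check is omitted for brevity):

use std::collections::HashMap;

// Advance last_commit to the highest slot replicated on a majority: the
// leader counts itself, plus every follower whose match_slot >= slot.
fn advance_commit(
    match_slot: &HashMap<u8, usize>, // follower ID -> highest replicated slot
    quorum_cnt: usize,               // majority size, e.g. 2 of 3 replicas
    last_commit: usize,
    log_len: usize,                  // slots 0..log_len exist in the in-mem log
) -> usize {
    let mut commit = last_commit;
    for slot in (last_commit + 1)..log_len {
        let cnt = 1 + match_slot.values().filter(|&&s| s >= slot).count();
        if cnt >= quorum_cnt {
            commit = slot;
        }
    }
    commit
}

fn main() {
    // 3 replicas (majority 2): follower 1 has replicated up to slot 4,
    // follower 2 only up to slot 2, so slots 3 and 4 become committed.
    let match_slot = HashMap::from([(1u8, 4usize), (2, 2)]);
    assert_eq!(advance_commit(&match_slot, 2, 2, 6), 4);
}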
+ fn handle_msg_request_vote_reply( + &mut self, + peer: ReplicaId, + term: Term, + granted: bool, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received RequestVoteReply <- {} with term {} {}", + peer, term, if granted { "granted" } else { "false" }); + if self.check_term(peer, term)? || self.role != Role::Candidate { + return Ok(()); + } + + // bookkeep this vote + self.votes_granted.insert(peer); + + // if a majority of servers have voted for me, become the leader + if self.votes_granted.len() as u8 >= self.quorum_cnt { + self.become_the_leader()?; + } + + Ok(()) + } + + /// Synthesized handler of receiving message from peer. + async fn handle_msg_recv( + &mut self, + peer: ReplicaId, + msg: PeerMsg, + ) -> Result<(), SummersetError> { + match msg { + PeerMsg::AppendEntries { + term, + prev_slot, + prev_term, + entries, + leader_commit, + } => { + self.handle_msg_append_entries( + peer, + term, + prev_slot, + prev_term, + entries, + leader_commit, + ) + .await + } + PeerMsg::AppendEntriesReply { + term, + end_slot, + success, + } => self + .handle_msg_append_entries_reply(peer, term, end_slot, success), + PeerMsg::RequestVote { + term, + last_slot, + last_term, + } => self.handle_msg_request_vote(peer, term, last_slot, last_term), + PeerMsg::RequestVoteReply { term, granted } => { + self.handle_msg_request_vote_reply(peer, term, granted) + } + } + } +} + +// RaftReplica state machine execution +impl RaftReplica { + /// Handler of state machine exec result chan recv. + fn handle_cmd_result( + &mut self, + cmd_id: CommandId, + cmd_result: CommandResult, + ) -> Result<(), SummersetError> { + let (slot, cmd_idx) = Self::split_command_id(cmd_id); + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } + assert!(slot < self.start_slot + self.log.len()); + pf_trace!(self.id; "executed cmd in entry at slot {} idx {}", + slot, cmd_idx); + + let entry = &mut self.log[slot - self.start_slot]; + assert!(cmd_idx < entry.reqs.len()); + let (client, ref req) = entry.reqs[cmd_idx]; + + // reply command result back to client + if let ApiRequest::Req { id: req_id, .. } = req { + if entry.external && self.external_api.has_client(client) { + self.external_api.send_reply( + ApiReply::Reply { + id: *req_id, + result: Some(cmd_result), + redirect: None, + }, + client, + )?; + pf_trace!(self.id; "replied -> client {} for slot {} idx {}", + client, slot, cmd_idx); + } + } else { + return logged_err!(self.id; "unexpected API request type"); + } + + // if all commands in this entry have been executed, update last_exec + if cmd_idx == entry.reqs.len() - 1 { + pf_debug!(self.id; "executed all cmds in entry at slot {}", slot); + self.last_exec = slot; + } + + Ok(()) + } +} + +// RaftReplica leader election timeout logic +impl RaftReplica { + /// Becomes a candidate and starts the election procedure. + async fn become_a_candidate(&mut self) -> Result<(), SummersetError> { + if self.role != Role::Follower { + return Ok(()); + } else if let Some(peer) = self.leader { + // mark old leader as dead + if self.peer_alive.get(peer)? 
{ + self.peer_alive.set(peer, false)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + + self.role = Role::Candidate; + + // increment current term and vote for myself + self.curr_term += 1; + self.voted_for = Some(self.id); + self.votes_granted = HashSet::from([self.id]); + pf_info!(self.id; "starting election with term {}...", self.curr_term); + + // also make the two critical fields durable, synchronously + self.storage_hub.submit_action( + 0, + LogAction::Write { + entry: DurEntry::Metadata { + curr_term: self.curr_term, + voted_for: self.voted_for, + }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + loop { + let (action_id, log_result) = self.storage_hub.get_result().await?; + if action_id != 0 { + // normal log action previously in queue; process it + self.handle_log_result(action_id, log_result)?; + } else { + if let LogResult::Write { + offset_ok: true, .. + } = log_result + { + } else { + return logged_err!(self.id; "unexpected log result type or failed write"); + } + break; + } + } + + // reset election timeout timer + self.heard_heartbeat(self.id, self.curr_term)?; + + // send RequestVote messages to all other peers + let last_slot = self.start_slot + self.log.len() - 1; + assert!(last_slot >= self.start_slot); + let last_term = self.log[last_slot - self.start_slot].term; + self.transport_hub.bcast_msg( + PeerMsg::RequestVote { + term: self.curr_term, + last_slot, + last_term, + }, + None, + )?; + pf_trace!(self.id; "broadcast RequestVote with term {} last {} term {}", + self.curr_term, last_slot, last_term); + + Ok(()) + } + + /// Becomes the leader after enough votes granted for me. + fn become_the_leader(&mut self) -> Result<(), SummersetError> { + pf_info!(self.id; "elected as leader with term {}", self.curr_term); + + // clear peers' heartbeat reply counters, and broadcast a heartbeat now + for cnts in self.hb_reply_cnts.values_mut() { + *cnts = (1, 0, 0); + } + self.bcast_heartbeats()?; + + // re-initialize next_slot and match_slot information + for slot in self.next_slot.values_mut() { + *slot = self.start_slot + self.log.len(); + } + for slot in self.match_slot.values_mut() { + *slot = 0; + } + + Ok(()) + } + + /// Broadcasts empty AppendEntries messages as heartbeats to all peers. + fn bcast_heartbeats(&mut self) -> Result<(), SummersetError> { + let prev_slot = self.start_slot + self.log.len() - 1; + assert!(prev_slot >= self.start_slot); + let prev_term = self.log[prev_slot - self.start_slot].term; + self.transport_hub.bcast_msg( + PeerMsg::AppendEntries { + term: self.curr_term, + prev_slot, + prev_term, + entries: vec![], + leader_commit: self.last_commit, + }, + None, + )?; + + // update max heartbeat reply counters and their repetitions seen + for (&peer, cnts) in self.hb_reply_cnts.iter_mut() { + if cnts.0 > cnts.1 { + // more hb replies have been received from this peer; it is + // probably alive + cnts.1 = cnts.0; + cnts.2 = 0; + } else { + // did not receive hb reply from this peer at least for the + // last sent hb from me; increment repetition count + cnts.2 += 1; + let repeat_threshold = (self.config.hb_hear_timeout_min + / self.config.hb_send_interval_ms) + as u8; + if cnts.2 > repeat_threshold { + // did not receive hb reply from this peer for too many + // past hbs sent from me; this peer is probably dead + if self.peer_alive.get(peer)? 
{ + self.peer_alive.set(peer, false)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + } + } + + // I also heard this heartbeat from myself + self.heard_heartbeat(self.id, self.curr_term)?; + + // pf_trace!(self.id; "broadcast heartbeats term {}", self.curr_term); + Ok(()) + } + + /// Chooses a random hb_hear_timeout from the min-max range and kicks off + /// the hb_hear_timer. + fn kickoff_hb_hear_timer(&mut self) -> Result<(), SummersetError> { + let timeout_ms = thread_rng().gen_range( + self.config.hb_hear_timeout_min..=self.config.hb_hear_timeout_max, + ); + + // pf_trace!(self.id; "kickoff hb_hear_timer @ {} ms", timeout_ms); + self.hb_hear_timer + .kickoff(Duration::from_millis(timeout_ms))?; + Ok(()) + } + + /// Heard a heartbeat from some other replica. Resets election timer. + fn heard_heartbeat( + &mut self, + peer: ReplicaId, + _term: Term, + ) -> Result<(), SummersetError> { + if peer != self.id { + self.hb_reply_cnts.get_mut(&peer).unwrap().0 += 1; + if !self.peer_alive.get(peer)? { + self.peer_alive.set(peer, true)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + + // reset hearing timer + self.kickoff_hb_hear_timer()?; + + // pf_trace!(self.id; "heard heartbeat <- {} term {}", peer, term); + Ok(()) + } +} + +// RaftReplica control messages handling +impl RaftReplica { + /// Handler of ResetState control message. + async fn handle_ctrl_reset_state( + &mut self, + durable: bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got restart req"); + + // send leave notification to peers and wait for their replies + self.transport_hub.leave().await?; + + // send leave notification to manager and wait for its reply + self.control_hub.send_ctrl(CtrlMsg::Leave)?; + while self.control_hub.recv_ctrl().await? != CtrlMsg::LeaveReply {} + + // if `durable` is false, truncate backer file + if !durable { + // use 0 as a special log action ID here + self.storage_hub + .submit_action(0, LogAction::Truncate { offset: 0 })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id == 0 { + if log_result + != (LogResult::Truncate { + offset_ok: true, + now_size: 0, + }) + { + return logged_err!(self.id; "failed to truncate log to 0"); + } else { + return Ok(()); + } + } + } + } + + Ok(()) + } + + /// Handler of Pause control message. + fn handle_ctrl_pause( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got pause req"); + *paused = true; + self.control_hub.send_ctrl(CtrlMsg::PauseReply)?; + Ok(()) + } + + /// Handler of Resume control message. + fn handle_ctrl_resume( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got resume req"); + + // reset leader heartbeat timer + self.hb_hear_timer.cancel()?; + self.kickoff_hb_hear_timer()?; + + *paused = false; + self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; + Ok(()) + } + + /// Handler of TakeSnapshot control message. + async fn handle_ctrl_take_snapshot( + &mut self, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server told to take snapshot"); + self.take_new_snapshot().await?; + + self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { + new_start: self.start_slot, + })?; + Ok(()) + } + + /// Synthesized handler of manager control messages. If ok, returns + /// `Some(true)` if decides to terminate and reboot, `Some(false)` if + /// decides to shutdown completely, and `None` if not terminating. 
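The follower-health heuristic in `bcast_heartbeats` above boils down to: reset the repetition counter whenever a fresh reply has arrived since the last heartbeat, otherwise count this heartbeat as unanswered, and suspect the peer once the unanswered run exceeds how many heartbeats fit into the minimum election timeout (600 / 50 = 12 with the defaults). A self-contained sketch with illustrative names:

// One heartbeat tick of the health check for a single peer: returns the
// updated (replied, seen-at-last-send, repetitions) tuple and whether the
// peer should now be suspected dead.
fn update_health(
    (replied, seen_at_last, repeats): (u64, u64, u8),
    hb_hear_timeout_min: u64,
    hb_send_interval_ms: u64,
) -> ((u64, u64, u8), bool) {
    if replied > seen_at_last {
        ((replied, replied, 0), false) // fresh reply seen; not suspected
    } else {
        let threshold = (hb_hear_timeout_min / hb_send_interval_ms) as u8;
        ((replied, seen_at_last, repeats + 1), repeats + 1 > threshold)
    }
}

fn main() {
    // no new replies for 13 heartbeats in a row -> peer becomes suspected
    let (mut cnts, mut suspected) = ((3u64, 3u64, 0u8), false);
    for _ in 0..13 {
        (cnts, suspected) = update_health(cnts, 600, 50);
    }
    assert!(suspected);
}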
+ async fn handle_ctrl_msg( + &mut self, + msg: CtrlMsg, + paused: &mut bool, + ) -> Result, SummersetError> { + match msg { + CtrlMsg::ResetState { durable } => { + self.handle_ctrl_reset_state(durable).await?; + Ok(Some(true)) + } + + CtrlMsg::Pause => { + self.handle_ctrl_pause(paused)?; + Ok(None) + } + + CtrlMsg::Resume => { + self.handle_ctrl_resume(paused)?; + Ok(None) + } + + CtrlMsg::TakeSnapshot => { + self.handle_ctrl_take_snapshot().await?; + Ok(None) + } + + _ => Ok(None), // ignore all other types + } + } +} + +// RaftReplica recovery from durable log +impl RaftReplica { + /// Recover state from durable storage log. + async fn recover_from_log(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.log_offset, 0); + + // first, try to read the first several bytes, which should record + // necessary durable metadata + self.storage_hub + .submit_action(0, LogAction::Read { offset: 0 })?; + let (_, log_result) = self.storage_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: + Some(DurEntry::Metadata { + curr_term, + voted_for, + }), + end_offset, + } => { + self.log_offset = end_offset; + + // recover necessary metadata info + self.curr_term = curr_term; + self.voted_for = voted_for; + + // read out and push all log entries into memory log + loop { + // using 0 as a special log action ID + self.storage_hub.submit_action( + 0, + LogAction::Read { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(DurEntry::LogEntry { mut entry }), + end_offset, + } => { + entry.log_offset = self.log_offset; + self.log.push(entry); + // update log offset + self.log_offset = end_offset; + } + LogResult::Read { entry: None, .. } => { + // end of log reached + break; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + } + } + + LogResult::Read { entry: None, .. } => { + // log file is empty, write initial metadata + self.storage_hub.submit_action( + 0, + LogAction::Write { + entry: DurEntry::Metadata { + curr_term: 0, + voted_for: None, + }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + if let LogResult::Write { + offset_ok: true, + now_size, + } = log_result + { + self.log_offset = now_size; + } else { + return logged_err!(self.id; "unexpected log result type or failed write"); + } + // ... and write the 0-th dummy entry + self.storage_hub.submit_action( + 0, + LogAction::Write { + entry: DurEntry::LogEntry { + entry: LogEntry { + term: 0, + reqs: vec![], + external: false, + log_offset: self.log_offset, + }, + }, + offset: self.log_offset, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + if let LogResult::Write { + offset_ok: true, + now_size, + } = log_result + { + self.log[0].log_offset = self.log_offset; + self.log_offset = now_size; + } else { + return logged_err!(self.id; "unexpected log result type or failed write"); + } + } + + _ => return logged_err!(self.id; "unexpected log result type"), + } + + // do an extra Truncate to remove paritial entry at the end if any + self.storage_hub.submit_action( + 0, + LogAction::Truncate { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + if let LogResult::Truncate { + offset_ok: true, .. 
+ } = log_result + { + if self.log_offset > 0 { + pf_info!(self.id; "recovered from wal log: term {} voted {:?} |log| {}", + self.curr_term, self.voted_for, self.log.len()); + } + Ok(()) + } else { + logged_err!(self.id; "unexpected log result type or failed truncate") + } + } +} + +// RaftReplica snapshotting & GC logic +impl RaftReplica { + /// Dump new key-value pairs to snapshot file. + async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { + // collect all key-value pairs put up to exec_bar + let mut pairs = HashMap::new(); + for slot in self.start_slot..self.last_exec { + let entry = &self.log[slot - self.start_slot]; + for (_, req) in entry.reqs.clone() { + if let ApiRequest::Req { + cmd: Command::Put { key, value }, + .. + } = req + { + pairs.insert(key, value); + } + } + } + + // write the collection to snapshot file + self.snapshot_hub.submit_action( + 0, // using 0 as dummy log action ID + LogAction::Append { + entry: SnapEntry::KVPairSet { pairs }, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + if let LogResult::Append { now_size } = log_result { + self.snap_offset = now_size; + Ok(()) + } else { + logged_err!( + self.id; + "unexpected log result type" + ) + } + } + + /// Discard everything lower than start_slot in durable log. + async fn snapshot_discard_log(&mut self) -> Result<(), SummersetError> { + assert!(!self.log.is_empty()); + let cut_offset = self.log[0].log_offset; + + // discard the log before cut_offset + if cut_offset > 0 { + self.storage_hub + .submit_action(0, LogAction::Discard { offset: cut_offset })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id != 0 { + // normal log action previously in queue; process it + self.handle_log_result(action_id, log_result)?; + } else { + if let LogResult::Discard { + offset_ok: true, + now_size, + } = log_result + { + assert_eq!(self.log_offset - cut_offset, now_size); + self.log_offset = now_size; + } else { + return logged_err!( + self.id; + "unexpected log result type or failed discard" + ); + } + break; + } + } + } + + // update entry.log_offset for all remaining in-mem entries + for entry in &mut self.log { + if entry.log_offset > 0 { + assert!(entry.log_offset >= cut_offset); + entry.log_offset -= cut_offset; + } + } + + Ok(()) + } + + /// Take a snapshot up to current last_exec, then discard the in-mem log up + /// to that index as well as their data in the durable log file. + /// + /// NOTE: the current implementation does not guard against crashes in the + /// middle of taking a snapshot. Production quality implementations should + /// make the snapshotting action "atomic". + /// + /// NOTE: the current implementation does not take care of InstallSnapshot + /// messages (which is needed when some lagging follower has some slot + /// which all other peers have snapshotted); we assume here that failed + /// Accept messages will be retried indefinitely until success before its + /// associated data gets discarded from leader's memory. 
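Taken together, the snapshot helpers leave the file as a `SlotInfo` header at offset 0 followed by one appended `KVPairSet` per snapshot pass, which is what snapshot recovery replays in order. A rough sketch with hypothetical mirror types, assuming `String` keys and values as in `Command::Put`:

use std::collections::HashMap;

// Hypothetical mirror of the snapshot file layout produced above; recovery
// takes the start_slot from the header and folds every key-value set into
// the state machine in file order.
enum SnapSketch {
    SlotInfo { start_slot: usize },
    KVPairSet { pairs: HashMap<String, String> },
}

fn replay(entries: Vec<SnapSketch>) -> (usize, HashMap<String, String>) {
    let mut start_slot = 0;
    let mut state = HashMap::new();
    for entry in entries {
        match entry {
            SnapSketch::SlotInfo { start_slot: s } => start_slot = s,
            SnapSketch::KVPairSet { pairs } => state.extend(pairs),
        }
    }
    (start_slot, state)
}

fn main() {
    let (start, state) = replay(vec![
        SnapSketch::SlotInfo { start_slot: 7 },
        SnapSketch::KVPairSet {
            pairs: HashMap::from([("k1".into(), "v2".into())]),
        },
    ]);
    assert_eq!((start, state.len()), (7, 1));
}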
+ async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { + pf_debug!(self.id; "taking new snapshot: start {} exec {}", + self.start_slot, self.last_exec); + assert!(self.last_exec + 1 >= self.start_slot); + if self.last_exec < self.start_slot + 1 { + // always keep at least one entry in log to make indexing happy + return Ok(()); + } + + // collect and dump all Puts in executed entries + if self.role == Role::Leader { + // NOTE: broadcast heartbeats here to appease followers + self.bcast_heartbeats()?; + } + self.snapshot_dump_kv_pairs().await?; + + // write new slot info entry to the head of snapshot + self.snapshot_hub.submit_action( + 0, + LogAction::Write { + entry: SnapEntry::SlotInfo { + start_slot: self.last_exec, + }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + match log_result { + LogResult::Write { + offset_ok: true, .. + } => {} + _ => { + return logged_err!(self.id; "unexpected log result type or failed write"); + } + } + + // update start_slot and discard all in-mem log entries up to + // last_exec - 1 + self.log.drain(0..(self.last_exec - self.start_slot)); + self.start_slot = self.last_exec; + + // discarding everything lower than start_slot in durable log + if self.role == Role::Leader { + // NOTE: broadcast heartbeats here to appease followers + self.bcast_heartbeats()?; + } + self.snapshot_discard_log().await?; + + // reset the leader heartbeat hear timer + self.kickoff_hb_hear_timer()?; + + pf_info!(self.id; "took snapshot up to: start {}", self.start_slot); + Ok(()) + } + + /// Recover initial state from durable storage snapshot file. + async fn recover_from_snapshot(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.snap_offset, 0); + + // first, try to read the first several bytes, which should record the + // start_slot index + self.snapshot_hub + .submit_action(0, LogAction::Read { offset: 0 })?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(SnapEntry::SlotInfo { start_slot }), + end_offset, + } => { + self.snap_offset = end_offset; + + // recover start_slot info + self.start_slot = start_slot; + + // repeatedly apply key-value pairs + loop { + self.snapshot_hub.submit_action( + 0, + LogAction::Read { + offset: self.snap_offset, + }, + )?; + let (_, log_result) = + self.snapshot_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(SnapEntry::KVPairSet { pairs }), + end_offset, + } => { + // execute Put commands on state machine + for (key, value) in pairs { + self.state_machine.submit_cmd( + 0, + Command::Put { key, value }, + )?; + let _ = self.state_machine.get_result().await?; + } + // update snapshot file offset + self.snap_offset = end_offset; + } + LogResult::Read { entry: None, .. } => { + // end of log reached + break; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + } + + // tell manager about my start_slot index + self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { + new_start: self.start_slot, + })?; + + if self.start_slot > 0 { + pf_info!(self.id; "recovered from snapshot: start {}", + self.start_slot); + } + Ok(()) + } + + LogResult::Read { entry: None, .. } => { + // snapshot file is empty. 
Write a 1 as start_slot and return + self.snapshot_hub.submit_action( + 0, + LogAction::Write { + entry: SnapEntry::SlotInfo { start_slot: 1 }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + if let LogResult::Write { + offset_ok: true, + now_size, + } = log_result + { + self.snap_offset = now_size; + Ok(()) + } else { + logged_err!(self.id; "unexpected log result type or failed write") + } + } + + _ => { + logged_err!(self.id; "unexpected log result type") + } + } + } +} + #[async_trait] impl GenericReplica for RaftReplica { async fn new_and_setup( @@ -257,6 +1638,135 @@ impl GenericReplica for RaftReplica { manager: SocketAddr, config_str: Option<&str>, ) -> Result { + // connect to the cluster manager and get assigned a server ID + let mut control_hub = ControlHub::new_and_setup(manager).await?; + let id = control_hub.me; + let population = control_hub.population; + + // parse protocol-specific configs + let config = parsed_config!(config_str => ReplicaConfigRaft; + batch_interval_ms, max_batch_size, + backer_path, logger_sync, + hb_hear_timeout_min, hb_hear_timeout_max, + hb_send_interval_ms, + snapshot_path, snapshot_interval_s, + perf_storage_a, perf_storage_b, + perf_network_a, perf_network_b)?; + if config.batch_interval_ms == 0 { + return logged_err!( + id; + "invalid config.batch_interval_ms '{}'", + config.batch_interval_ms + ); + } + if config.hb_hear_timeout_min < 100 { + return logged_err!( + id; + "invalid config.hb_hear_timeout_min '{}'", + config.hb_hear_timeout_min + ); + } + if config.hb_hear_timeout_max < config.hb_hear_timeout_min + 100 { + return logged_err!( + id; + "invalid config.hb_hear_timeout_max '{}'", + config.hb_hear_timeout_max + ); + } + if config.hb_send_interval_ms == 0 { + return logged_err!( + id; + "invalid config.hb_send_interval_ms '{}'", + config.hb_send_interval_ms + ); + } + + // setup state machine module + let state_machine = StateMachine::new_and_setup(id).await?; + + // setup storage hub module + let storage_hub = StorageHub::new_and_setup( + id, + Path::new(&config.backer_path), + if config.perf_storage_a == 0 && config.perf_storage_b == 0 { + None + } else { + Some((config.perf_storage_a, config.perf_storage_b)) + }, + ) + .await?; + + // setup transport hub module + let mut transport_hub = TransportHub::new_and_setup( + id, + population, + p2p_addr, + if config.perf_network_a == 0 && config.perf_network_b == 0 { + None + } else { + Some((config.perf_network_a, config.perf_network_b)) + }, + ) + .await?; + + // ask for the list of peers to proactively connect to. Do this after + // transport hub has been set up, so that I will be able to accept + // later peer connections + control_hub.send_ctrl(CtrlMsg::NewServerJoin { + id, + protocol: SmrProtocol::Raft, + api_addr, + p2p_addr, + })?; + let to_peers = if let CtrlMsg::ConnectToPeers { to_peers, .. } = + control_hub.recv_ctrl().await? 
+        {
+            to_peers
+        } else {
+            return logged_err!(id; "unexpected ctrl msg type received");
+        };
+
+        // proactively connect to some peers, then wait until the entire
+        // population has connected with me
+        for (peer, addr) in to_peers {
+            transport_hub.connect_to_peer(peer, addr).await?;
+        }
+        transport_hub.wait_for_group(population).await?;
+
+        // setup snapshot hub module
+        let snapshot_hub = StorageHub::new_and_setup(
+            id,
+            Path::new(&config.snapshot_path),
+            None,
+        )
+        .await?;
+
+        // setup external API module, ready to take in client requests
+        let external_api = ExternalApi::new_and_setup(
+            id,
+            api_addr,
+            Duration::from_millis(config.batch_interval_ms),
+            config.max_batch_size,
+        )
+        .await?;
+
+        let mut hb_send_interval =
+            time::interval(Duration::from_millis(config.hb_send_interval_ms));
+        hb_send_interval.set_missed_tick_behavior(MissedTickBehavior::Skip);
+
+        let mut snapshot_interval = time::interval(Duration::from_secs(
+            if config.snapshot_interval_s > 0 {
+                config.snapshot_interval_s
+            } else {
+                60 // dummy non-zero value to make `time::interval` happy
+            },
+        ));
+        snapshot_interval.set_missed_tick_behavior(MissedTickBehavior::Skip);
+
+        let hb_reply_cnts = (0..population)
+            .filter_map(|p| if p == id { None } else { Some((p, (1, 0, 0))) })
+            .collect();
+
         Ok(RaftReplica {
             id,
             population,
@@ -264,6 +1774,39 @@ impl GenericReplica for RaftReplica {
             config,
             _api_addr: api_addr,
             _p2p_addr: p2p_addr,
+            control_hub,
+            external_api,
+            state_machine,
+            storage_hub,
+            snapshot_hub,
+            transport_hub,
+            role: Role::Follower,
+            leader: None,
+            hb_hear_timer: Timer::new(),
+            hb_send_interval,
+            hb_reply_cnts,
+            peer_alive: Bitmap::new(population, true),
+            curr_term: 0,
+            voted_for: None,
+            votes_granted: HashSet::new(),
+            log: vec![LogEntry {
+                term: 0,
+                reqs: vec![],
+                external: false,
+                log_offset: 0,
+            }],
+            start_slot: 0,
+            snapshot_interval,
+            last_commit: 0,
+            last_exec: 0,
+            next_slot: (0..population)
+                .filter_map(|s| if s == id { None } else { Some((s, 1)) })
+                .collect(),
+            match_slot: (0..population)
+                .filter_map(|s| if s == id { None } else { Some((s, 0)) })
+                .collect(),
+            log_offset: 0,
+            snap_offset: 0,
         })
     }
 
@@ -271,6 +1814,121 @@ impl GenericReplica for RaftReplica {
         &mut self,
         mut rx_term: watch::Receiver<bool>,
     ) -> Result<bool, SummersetError> {
+        // recover state from durable snapshot file
+        self.recover_from_snapshot().await?;
+
+        // recover the tail-piece memory log & state from remaining durable log
+        self.recover_from_log().await?;
+
+        // kick off leader activity hearing timer
+        self.kickoff_hb_hear_timer()?;
+
+        // main event loop
+        let mut paused = false;
+        loop {
+            tokio::select!
{ + // client request batch + req_batch = self.external_api.get_req_batch(), if !paused => { + if let Err(e) = req_batch { + pf_error!(self.id; "error getting req batch: {}", e); + continue; + } + let req_batch = req_batch.unwrap(); + if let Err(e) = self.handle_req_batch(req_batch) { + pf_error!(self.id; "error handling req batch: {}", e); + } + }, + + // durable logging result + log_result = self.storage_hub.get_result(), if !paused => { + if let Err(e) = log_result { + pf_error!(self.id; "error getting log result: {}", e); + continue; + } + let (action_id, log_result) = log_result.unwrap(); + if let Err(e) = self.handle_log_result(action_id, log_result) { + pf_error!(self.id; "error handling log result {}: {}", + action_id, e); + } + }, + + // message from peer + msg = self.transport_hub.recv_msg(), if !paused => { + if let Err(e) = msg { + pf_error!(self.id; "error receiving peer msg: {}", e); + continue; + } + let (peer, msg) = msg.unwrap(); + if let Err(e) = self.handle_msg_recv(peer, msg).await { + pf_error!(self.id; "error handling msg recv <- {}: {}", peer, e); + } + }, + + // state machine execution result + cmd_result = self.state_machine.get_result(), if !paused => { + if let Err(e) = cmd_result { + pf_error!(self.id; "error getting cmd result: {}", e); + continue; + } + let (cmd_id, cmd_result) = cmd_result.unwrap(); + if let Err(e) = self.handle_cmd_result(cmd_id, cmd_result) { + pf_error!(self.id; "error handling cmd result {}: {}", cmd_id, e); + } + }, + + // leader inactivity timeout + _ = self.hb_hear_timer.timeout(), if !paused => { + if let Err(e) = self.become_a_candidate().await { + pf_error!(self.id; "error becoming a candidate: {}", e); + } + }, + + // leader sending heartbeat + _ = self.hb_send_interval.tick(), if !paused + && self.role == Role::Leader => { + if let Err(e) = self.bcast_heartbeats() { + pf_error!(self.id; "error broadcasting heartbeats: {}", e); + } + }, + + // autonomous snapshot taking timeout + _ = self.snapshot_interval.tick(), if !paused + && self.config.snapshot_interval_s > 0 => { + if let Err(e) = self.take_new_snapshot().await { + pf_error!(self.id; "error taking a new snapshot: {}", e); + } else { + self.control_hub.send_ctrl( + CtrlMsg::SnapshotUpTo { new_start: self.start_slot } + )?; + } + }, + + // manager control message + ctrl_msg = self.control_hub.recv_ctrl() => { + if let Err(e) = ctrl_msg { + pf_error!(self.id; "error getting ctrl msg: {}", e); + continue; + } + let ctrl_msg = ctrl_msg.unwrap(); + match self.handle_ctrl_msg(ctrl_msg, &mut paused).await { + Ok(terminate) => { + if let Some(restart) = terminate { + return Ok(restart); + } + }, + Err(e) => { + pf_error!(self.id; "error handling ctrl msg: {}", e); + } + } + }, + + // receiving termination signal + _ = rx_term.changed() => { + pf_warn!(self.id; "server caught termination signal"); + return Ok(false); + } + } + } } fn id(&self) -> ReplicaId { diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 3280baf1..d10bf38f 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -1879,7 +1879,7 @@ impl RSPaxosReplica { Ok(()) } - /// Take a snapshot up to current exec_idx, then discard the in-mem log up + /// Take a snapshot up to current exec_bar, then discard the in-mem log up /// to that index as well as outdate entries in the durable WAL log file. /// /// NOTE: the current implementation does not guard against crashes in the @@ -1924,7 +1924,7 @@ impl RSPaxosReplica { offset_ok: true, .. 
} => {} _ => { - return logged_err!(self.id; "unexpected log result type or failed truncate"); + return logged_err!(self.id; "unexpected log result type or failed write"); } } @@ -2043,7 +2043,7 @@ impl RSPaxosReplica { self.snap_offset = now_size; Ok(()) } else { - logged_err!(self.id; "unexpected log result type or failed truncate") + logged_err!(self.id; "unexpected log result type or failed write") } } From 63591b66127ac8cae988b80d1a15d8c5266f905f Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 8 Oct 2023 16:41:38 -0500 Subject: [PATCH 86/89] finish raft impl & debugging --- README.md | 5 +- src/manager/reigner.rs | 2 +- src/protocols/crossword.rs | 14 +- src/protocols/multipaxos.rs | 14 +- src/protocols/raft.rs | 238 ++++++++++++++------ src/protocols/rs_paxos.rs | 14 +- src/server/storage.rs | 79 +++++-- summerset_client/src/drivers/closed_loop.rs | 148 ++++++------ summerset_client/src/drivers/open_loop.rs | 60 ++--- 9 files changed, 379 insertions(+), 195 deletions(-) diff --git a/README.md b/README.md index 89481b4a..79a88675 100644 --- a/README.md +++ b/README.md @@ -160,8 +160,9 @@ Complete cluster management and benchmarking scripts are available in another re - [x] TLA+ spec - [x] implementation of RS-Paxos - [ ] TLA+ spec -- [ ] implementation of Raft - - [ ] snapshotting & garbage collection +- [x] implementation of Raft + - [x] state persistence & restart check + - [x] snapshotting & garbage collection - [ ] membership discovery & view changes - [ ] TLA+ spec - [ ] implementation of CRaft diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index 41ae38ec..3be28cde 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -21,7 +21,7 @@ use tokio::task::JoinHandle; /// Control message from/to servers. Control traffic could be bidirectional: /// some initiated by the manager and some by servers. -// TODO: later add leader change, membership change, etc. +// TODO: later add membership/view change, link drop, etc. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] pub enum CtrlMsg { /// Server -> Manager: new server up, requesting a list of peers' addresses diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 71942eb5..41d44a9b 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -1735,6 +1735,8 @@ impl CrosswordReplica { /// Chooses a random hb_hear_timeout from the min-max range and kicks off /// the hb_hear_timer. 
fn kickoff_hb_hear_timer(&mut self) -> Result<(), SummersetError> { + self.hb_hear_timer.cancel()?; + let timeout_ms = thread_rng().gen_range( self.config.hb_hear_timeout_min..=self.config.hb_hear_timeout_max, ); @@ -2028,7 +2030,6 @@ impl CrosswordReplica { pf_warn!(self.id; "server got resume req"); // reset leader heartbeat timer - self.hb_hear_timer.cancel()?; self.kickoff_hb_hear_timer()?; *paused = false; @@ -2300,8 +2301,13 @@ impl CrosswordReplica { // discard the log before cut_offset if cut_offset > 0 { - self.storage_hub - .submit_action(0, LogAction::Discard { offset: cut_offset })?; + self.storage_hub.submit_action( + 0, + LogAction::Discard { + offset: cut_offset, + keep: 0, + }, + )?; loop { let (action_id, log_result) = self.storage_hub.get_result().await?; @@ -2797,7 +2803,7 @@ impl GenericReplica for CrosswordReplica { // autonomous snapshot taking timeout _ = self.snapshot_interval.tick(), if !paused - && self.config.snapshot_interval_s > 0 => { + && self.config.snapshot_interval_s > 0 => { if let Err(e) = self.take_new_snapshot().await { pf_error!(self.id; "error taking a new snapshot: {}", e); } else { diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index a19e1d48..76955978 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -1204,6 +1204,8 @@ impl MultiPaxosReplica { /// Chooses a random hb_hear_timeout from the min-max range and kicks off /// the hb_hear_timer. fn kickoff_hb_hear_timer(&mut self) -> Result<(), SummersetError> { + self.hb_hear_timer.cancel()?; + let timeout_ms = thread_rng().gen_range( self.config.hb_hear_timeout_min..=self.config.hb_hear_timeout_max, ); @@ -1331,7 +1333,6 @@ impl MultiPaxosReplica { pf_warn!(self.id; "server got resume req"); // reset leader heartbeat timer - self.hb_hear_timer.cancel()?; self.kickoff_hb_hear_timer()?; *paused = false; @@ -1586,8 +1587,13 @@ impl MultiPaxosReplica { // discard the log before cut_offset if cut_offset > 0 { - self.storage_hub - .submit_action(0, LogAction::Discard { offset: cut_offset })?; + self.storage_hub.submit_action( + 0, + LogAction::Discard { + offset: cut_offset, + keep: 0, + }, + )?; loop { let (action_id, log_result) = self.storage_hub.get_result().await?; @@ -2049,7 +2055,7 @@ impl GenericReplica for MultiPaxosReplica { // autonomous snapshot taking timeout _ = self.snapshot_interval.tick(), if !paused - && self.config.snapshot_interval_s > 0 => { + && self.config.snapshot_interval_s > 0 => { if let Err(e) = self.take_new_snapshot().await { pf_error!(self.id; "error taking a new snapshot: {}", e); } else { diff --git a/src/protocols/raft.rs b/src/protocols/raft.rs index 4693b5cf..5dc54a83 100644 --- a/src/protocols/raft.rs +++ b/src/protocols/raft.rs @@ -5,6 +5,7 @@ //! - //! - +use std::cmp; use std::collections::{HashMap, HashSet}; use std::path::Path; use std::net::SocketAddr; @@ -159,6 +160,8 @@ enum PeerMsg { prev_term: Term, entries: Vec, leader_commit: usize, + /// For conservative snapshotting purpose. + last_snap: usize, }, /// AppendEntries reply from follower to leader. @@ -277,9 +280,18 @@ pub struct RaftReplica { /// For each server, index of the highest log entry known to be replicated. match_slot: HashMap, + /// Slot index up to which it is safe to take snapshot. + /// NOTE: we are taking a conservative approach here that a snapshot + /// covering an entry can be taken only when all servers have durably + /// committed that entry. + last_snap: usize, + /// Current durable log file end offset. 
    log_offset: usize,
 
+    /// Current durable log offset at the end of the metadata entry.
+    log_meta_end: usize,
+
     /// Current durable snapshot file offset.
     snap_offset: usize,
 }
@@ -346,6 +358,7 @@ impl RaftReplica {
             self.heard_heartbeat(peer, term)?; // refresh election timer
             if self.role != Role::Follower {
                 self.role = Role::Follower;
+                pf_trace!(self.id; "converted back to follower");
                 Ok(true)
             } else {
                 Ok(false)
@@ -398,7 +411,7 @@ impl RaftReplica {
             term: self.curr_term,
             reqs: req_batch,
             external: true,
-            log_offset: self.log_offset,
+            log_offset: 0,
         };
         let slot = self.start_slot + self.log.len();
         self.log.push(entry.clone());
@@ -434,13 +447,13 @@ impl RaftReplica {
 
         // broadcast AppendEntries messages to followers
         for peer in 0..self.population {
-            if peer == self.id {
+            if peer == self.id || self.next_slot[&peer] < 1 {
                 continue;
             }
             let prev_slot = self.next_slot[&peer] - 1;
             if prev_slot < self.start_slot {
-                pf_error!(self.id; "snapshotted slot {} queried", prev_slot);
+                return logged_err!(self.id; "snapshotted slot {} queried", prev_slot);
             }
             let prev_term = self.log[prev_slot - self.start_slot].term;
             let entries = self
@@ -458,6 +471,7 @@ impl RaftReplica {
                     prev_term,
                     entries,
                     leader_commit: self.last_commit,
+                    last_snap: self.last_snap,
                 },
                 peer,
             )?;
@@ -486,21 +500,6 @@ impl RaftReplica {
                slot, slot_e);
         assert!(slot <= slot_e);
 
-        // submit newly committed entry for state machine execution
-        if slot > self.last_exec && slot <= self.last_commit {
-            let entry = &self.log[slot - self.start_slot];
-            for (cmd_idx, (_, req)) in entry.reqs.iter().enumerate() {
-                if let ApiRequest::Req { cmd, .. } = req {
-                    self.state_machine.submit_cmd(
-                        Self::make_command_id(slot, cmd_idx),
-                        cmd.clone(),
-                    )?;
-                } else {
-                    continue; // ignore other types of requests
-                }
-            }
-        }
-
         // if all consecutive entries are made durable, reply AppendEntries
         // success back to leader
         if slot == slot_e {
@@ -534,10 +533,11 @@ impl RaftReplica {
         assert!(slot_e < self.start_slot + self.log.len());
 
         if let LogResult::Append { now_size } = log_result {
-            assert_eq!(
-                self.log[slot - self.start_slot].log_offset,
-                self.log_offset
-            );
+            let entry = &mut self.log[slot - self.start_slot];
+            if entry.log_offset != self.log_offset {
+                // entry has incorrect log_offset bookkept; update it
+                entry.log_offset = self.log_offset;
+            }
             assert!(now_size > self.log_offset);
             self.log_offset = now_size;
         } else {
@@ -557,6 +557,7 @@ impl RaftReplica {
 // RaftReplica peer-peer messages handling
 impl RaftReplica {
     /// Handler of AppendEntries message from leader.
+    #[allow(clippy::too_many_arguments)]
     async fn handle_msg_append_entries(
         &mut self,
         leader: ReplicaId,
@@ -565,9 +566,12 @@ impl RaftReplica {
         prev_term: Term,
         mut entries: Vec<LogEntry>,
         leader_commit: usize,
+        last_snap: usize,
     ) -> Result<(), SummersetError> {
-        pf_trace!(self.id; "received AcceptEntries <- {} for slot > {} term {}",
-                           leader, prev_slot, term);
+        if !entries.is_empty() {
+            pf_trace!(self.id; "received AcceptEntries <- {} for slots {} - {} term {}",
+                               leader, prev_slot + 1, prev_slot + entries.len(), term);
+        }
         if self.check_term(leader, term)?
|| self.role != Role::Follower { return Ok(()); } @@ -587,6 +591,8 @@ impl RaftReplica { }, leader, )?; + pf_trace!(self.id; "sent AcceptEntriesReply -> {} term {} end_slot {} fail", + leader, self.curr_term, prev_slot); if term >= self.curr_term { // also refresh heartbeat timer here since the "decrementing" @@ -651,13 +657,14 @@ impl RaftReplica { // append new entries into my log, and submit logger actions to make // new entries durable - let (num_entries, mut num_appended) = (0, 0); + let (num_entries, mut num_appended) = (entries.len(), 0); for (slot, mut entry) in entries .drain((first_new - prev_slot - 1)..entries.len()) .enumerate() .map(|(s, e)| (s + first_new, e)) { - entry.external = false; // not from client + entry.log_offset = 0; + self.log.push(entry.clone()); self.storage_hub.submit_action( Self::make_log_action_id( @@ -670,6 +677,7 @@ impl RaftReplica { sync: self.config.logger_sync, }, )?; + num_appended += 1; } @@ -689,11 +697,29 @@ impl RaftReplica { // if leader_commit is larger than my last_commit, update last_commit if leader_commit > self.last_commit { - self.last_commit = if leader_commit < prev_slot + entries.len() { - leader_commit - } else { - prev_slot + entries.len() - }; + let new_commit = cmp::min(leader_commit, prev_slot + entries.len()); + + // submit newly committed entries for state machine execution + for slot in (self.last_commit + 1)..=new_commit { + let entry = &self.log[slot - self.start_slot]; + for (cmd_idx, (_, req)) in entry.reqs.iter().enumerate() { + if let ApiRequest::Req { cmd, .. } = req { + self.state_machine.submit_cmd( + Self::make_command_id(slot, cmd_idx), + cmd.clone(), + )?; + } else { + continue; // ignore other types of requests + } + } + } + + self.last_commit = new_commit; + } + + // if last_snap is larger than mine, update last_snap + if last_snap > self.last_snap { + self.last_snap = last_snap; } Ok(()) @@ -707,11 +733,14 @@ impl RaftReplica { end_slot: usize, success: bool, ) -> Result<(), SummersetError> { - pf_trace!(self.id; "received AcceptEntriesReply <- {} for term {} {}", - peer, term, if success { "ok" } else { "fail" }); + if !success || self.match_slot[&peer] != end_slot { + pf_trace!(self.id; "received AcceptEntriesReply <- {} for term {} {}", + peer, term, if success { "ok" } else { "fail" }); + } if self.check_term(peer, term)? || self.role != Role::Leader { return Ok(()); } + self.heard_heartbeat(peer, term)?; if success { // success: update next_slot and match_slot for follower @@ -720,6 +749,7 @@ impl RaftReplica { // since we updated some match_slot here, check if any additional // entries are now considered committed + let mut new_commit = self.last_commit; for slot in (self.last_commit + 1)..(self.start_slot + self.log.len()) { @@ -734,13 +764,13 @@ impl RaftReplica { .filter(|&&s| s >= slot) .count() as u8; if match_cnt >= self.quorum_cnt { - // quorum size reached, set last_commit to here - self.last_commit = slot; + // quorum size reached, set new_commit to here + new_commit = slot; } } // submit newly committed commands, if any, for execution - for slot in (self.last_exec + 1)..=self.last_commit { + for slot in (self.last_commit + 1)..=new_commit { let entry = &self.log[slot - self.start_slot]; for (cmd_idx, (_, req)) in entry.reqs.iter().enumerate() { if let ApiRequest::Req { cmd, .. 
} = req { @@ -753,13 +783,34 @@ impl RaftReplica { } } } + + self.last_commit = new_commit; + + // also check if any additional entries are safe to snapshot + for slot in (self.last_snap + 1)..=end_slot { + let match_cnt = 1 + self + .match_slot + .values() + .filter(|&&s| s >= slot) + .count() as u8; + if match_cnt == self.population { + // all servers have durably stored this entry + self.last_snap = slot; + } + } } else { // failed: decrement next_slot for follower and retry + // NOTE: the optimization of fast-backward bypassing (instead of + // always decrementing by 1) not implemented + if self.next_slot[&peer] == 1 { + return Ok(()); // cannot move backward any more + } *self.next_slot.get_mut(&peer).unwrap() -= 1; let prev_slot = self.next_slot[&peer] - 1; if prev_slot < self.start_slot { - pf_error!(self.id; "snapshotted slot {} queried", prev_slot); + *self.next_slot.get_mut(&peer).unwrap() += 1; + return logged_err!(self.id; "snapshotted slot {} queried", prev_slot); } let prev_term = self.log[prev_slot - self.start_slot].term; let entries = self @@ -776,6 +827,7 @@ impl RaftReplica { prev_term, entries, leader_commit: self.last_commit, + last_snap: self.last_snap, }, peer, )?; @@ -808,7 +860,7 @@ impl RaftReplica { }, candidate, )?; - pf_trace!(self.id; "sent RequestVote -> {} term {} false", + pf_trace!(self.id; "sent RequestVoteReply -> {} term {} false", candidate, self.curr_term); return Ok(()); } @@ -828,8 +880,12 @@ impl RaftReplica { }, candidate, )?; - pf_trace!(self.id; "sent RequestVote -> {} term {} granted", + pf_trace!(self.id; "sent RequestVoteReply -> {} term {} granted", candidate, self.curr_term); + + // hear a heartbeat here to prevent me from starting an + // election soon + self.heard_heartbeat(candidate, term)?; } } @@ -873,6 +929,7 @@ impl RaftReplica { prev_term, entries, leader_commit, + last_snap, } => { self.handle_msg_append_entries( peer, @@ -881,6 +938,7 @@ impl RaftReplica { prev_term, entries, leader_commit, + last_snap, ) .await } @@ -1024,7 +1082,8 @@ impl RaftReplica { /// Becomes the leader after enough votes granted for me. fn become_the_leader(&mut self) -> Result<(), SummersetError> { - pf_info!(self.id; "elected as leader with term {}", self.curr_term); + pf_info!(self.id; "elected to be leader with term {}", self.curr_term); + self.role = Role::Leader; // clear peers' heartbeat reply counters, and broadcast a heartbeat now for cnts in self.hb_reply_cnts.values_mut() { @@ -1055,6 +1114,7 @@ impl RaftReplica { prev_term, entries: vec![], leader_commit: self.last_commit, + last_snap: self.last_snap, }, None, )?; @@ -1094,6 +1154,8 @@ impl RaftReplica { /// Chooses a random hb_hear_timeout from the min-max range and kicks off /// the hb_hear_timer. 
fn kickoff_hb_hear_timer(&mut self) -> Result<(), SummersetError> { + self.hb_hear_timer.cancel()?; + let timeout_ms = thread_rng().gen_range( self.config.hb_hear_timeout_min..=self.config.hb_hear_timeout_max, ); @@ -1187,7 +1249,6 @@ impl RaftReplica { pf_warn!(self.id; "server got resume req"); // reset leader heartbeat timer - self.hb_hear_timer.cancel()?; self.kickoff_hb_hear_timer()?; *paused = false; @@ -1264,6 +1325,7 @@ impl RaftReplica { end_offset, } => { self.log_offset = end_offset; + self.log_meta_end = end_offset; // recover necessary metadata info self.curr_term = curr_term; @@ -1286,9 +1348,9 @@ impl RaftReplica { end_offset, } => { entry.log_offset = self.log_offset; + entry.external = false; // no re-replying to clients self.log.push(entry); - // update log offset - self.log_offset = end_offset; + self.log_offset = end_offset; // update log offset } LogResult::Read { entry: None, .. } => { // end of log reached @@ -1321,10 +1383,18 @@ impl RaftReplica { } = log_result { self.log_offset = now_size; + self.log_meta_end = now_size; } else { return logged_err!(self.id; "unexpected log result type or failed write"); } - // ... and write the 0-th dummy entry + // ... and push a 0-th dummy entry into in-mem log + self.log.push(LogEntry { + term: 0, + reqs: vec![], + external: false, + log_offset: 0, + }); + // ... and write the 0-th dummy entry durably self.storage_hub.submit_action( 0, LogAction::Write { @@ -1382,10 +1452,13 @@ impl RaftReplica { // RaftReplica snapshotting & GC logic impl RaftReplica { /// Dump new key-value pairs to snapshot file. - async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { + async fn snapshot_dump_kv_pairs( + &mut self, + new_start_slot: usize, + ) -> Result<(), SummersetError> { // collect all key-value pairs put up to exec_bar let mut pairs = HashMap::new(); - for slot in self.start_slot..self.last_exec { + for slot in self.start_slot..new_start_slot { let entry = &self.log[slot - self.start_slot]; for (_, req) in entry.reqs.clone() { if let ApiRequest::Req { @@ -1420,13 +1493,26 @@ impl RaftReplica { /// Discard everything lower than start_slot in durable log. 
async fn snapshot_discard_log(&mut self) -> Result<(), SummersetError> { + // drain things currently in storage_hub's recv chan if head of log's + // durable file offset has not been set yet assert!(!self.log.is_empty()); + while self.log[0].log_offset == 0 { + let (action_id, log_result) = self.storage_hub.get_result().await?; + self.handle_log_result(action_id, log_result)?; + } let cut_offset = self.log[0].log_offset; - // discard the log before cut_offset + // discard the log after meta_end and before cut_offset if cut_offset > 0 { - self.storage_hub - .submit_action(0, LogAction::Discard { offset: cut_offset })?; + assert!(self.log_meta_end > 0); + assert!(self.log_meta_end <= cut_offset); + self.storage_hub.submit_action( + 0, + LogAction::Discard { + offset: cut_offset, + keep: self.log_meta_end, + }, + )?; loop { let (action_id, log_result) = self.storage_hub.get_result().await?; @@ -1439,7 +1525,10 @@ impl RaftReplica { now_size, } = log_result { - assert_eq!(self.log_offset - cut_offset, now_size); + assert_eq!( + self.log_offset - cut_offset + self.log_meta_end, + now_size + ); self.log_offset = now_size; } else { return logged_err!( @@ -1456,7 +1545,7 @@ impl RaftReplica { for entry in &mut self.log { if entry.log_offset > 0 { assert!(entry.log_offset >= cut_offset); - entry.log_offset -= cut_offset; + entry.log_offset -= cut_offset - self.log_meta_end; } } @@ -1472,15 +1561,18 @@ impl RaftReplica { /// /// NOTE: the current implementation does not take care of InstallSnapshot /// messages (which is needed when some lagging follower has some slot - /// which all other peers have snapshotted); we assume here that failed - /// Accept messages will be retried indefinitely until success before its - /// associated data gets discarded from leader's memory. + /// which all other peers have snapshotted); we take the conservative + /// approach that a snapshot is only taken when data has been durably + /// committed on all servers. 
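
// [Editor's aside, not part of the patch.] The conservative rule described in
// the NOTE above amounts to: the leader may only snapshot up to the highest
// slot that every replica has already durably matched, i.e. the minimum over
// all match_slot values (the leader trivially holds its own entries). A
// standalone sketch with simplified types, for illustration only; the names
// here are hypothetical and do not appear in the patch:
use std::collections::HashMap;

type ReplicaId = u8;

fn conservative_snapshot_bar(
    my_last_slot: usize,
    match_slot: &HashMap<ReplicaId, usize>,
) -> usize {
    match_slot
        .values()
        .copied()
        .chain(std::iter::once(my_last_slot)) // include the leader itself
        .min()
        .unwrap_or(0)
}

// e.g. with match_slot = {1: 7, 2: 5} and my_last_slot = 9, the bar is 5:
// slots up to 5 are stored on all three replicas and are safe to snapshot,
// which matches the "match_cnt == population" counting rule used by the patch.
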
async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { - pf_debug!(self.id; "taking new snapshot: start {} exec {}", - self.start_slot, self.last_exec); + pf_debug!(self.id; "taking new snapshot: start {} exec {} snap {}", + self.start_slot, self.last_exec, self.last_snap); assert!(self.last_exec + 1 >= self.start_slot); - if self.last_exec < self.start_slot + 1 { - // always keep at least one entry in log to make indexing happy + + // always keep at least one entry in log to make indexing happy + let new_start_slot = cmp::min(self.last_snap, self.last_exec); + assert!(new_start_slot < self.start_slot + self.log.len()); + if new_start_slot < self.start_slot + 1 { return Ok(()); } @@ -1489,14 +1581,14 @@ impl RaftReplica { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } - self.snapshot_dump_kv_pairs().await?; + self.snapshot_dump_kv_pairs(new_start_slot).await?; // write new slot info entry to the head of snapshot self.snapshot_hub.submit_action( 0, LogAction::Write { entry: SnapEntry::SlotInfo { - start_slot: self.last_exec, + start_slot: new_start_slot, }, offset: 0, sync: self.config.logger_sync, @@ -1513,9 +1605,9 @@ impl RaftReplica { } // update start_slot and discard all in-mem log entries up to - // last_exec - 1 - self.log.drain(0..(self.last_exec - self.start_slot)); - self.start_slot = self.last_exec; + // new_start_slot + self.log.drain(0..(new_start_slot - self.start_slot)); + self.start_slot = new_start_slot; // discarding everything lower than start_slot in durable log if self.role == Role::Leader { @@ -1550,6 +1642,11 @@ impl RaftReplica { // recover start_slot info self.start_slot = start_slot; + if start_slot > 0 { + self.last_commit = start_slot - 1; + self.last_exec = start_slot - 1; + self.last_snap = start_slot - 1; + } // repeatedly apply key-value pairs loop { @@ -1601,11 +1698,11 @@ impl RaftReplica { } LogResult::Read { entry: None, .. } => { - // snapshot file is empty. Write a 1 as start_slot and return + // snapshot file is empty. Write a 0 as start_slot and return self.snapshot_hub.submit_action( 0, LogAction::Write { - entry: SnapEntry::SlotInfo { start_slot: 1 }, + entry: SnapEntry::SlotInfo { start_slot: 0 }, offset: 0, sync: self.config.logger_sync, }, @@ -1789,12 +1886,7 @@ impl GenericReplica for RaftReplica { curr_term: 0, voted_for: None, votes_granted: HashSet::new(), - log: vec![LogEntry { - term: 0, - reqs: vec![], - external: false, - log_offset: 0, - }], + log: vec![], start_slot: 0, snapshot_interval, last_commit: 0, @@ -1805,7 +1897,9 @@ impl GenericReplica for RaftReplica { match_slot: (0..population) .filter_map(|s| if s == id { None } else { Some((s, 0)) }) .collect(), + last_snap: 0, log_offset: 0, + log_meta_end: 0, snap_offset: 0, }) } @@ -1893,7 +1987,7 @@ impl GenericReplica for RaftReplica { // autonomous snapshot taking timeout _ = self.snapshot_interval.tick(), if !paused - && self.config.snapshot_interval_s > 0 => { + && self.config.snapshot_interval_s > 0 => { if let Err(e) = self.take_new_snapshot().await { pf_error!(self.id; "error taking a new snapshot: {}", e); } else { diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index d10bf38f..0f7022f4 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -1442,6 +1442,8 @@ impl RSPaxosReplica { /// Chooses a random hb_hear_timeout from the min-max range and kicks off /// the hb_hear_timer. 
fn kickoff_hb_hear_timer(&mut self) -> Result<(), SummersetError> { + self.hb_hear_timer.cancel()?; + let timeout_ms = thread_rng().gen_range( self.config.hb_hear_timeout_min..=self.config.hb_hear_timeout_max, ); @@ -1569,7 +1571,6 @@ impl RSPaxosReplica { pf_warn!(self.id; "server got resume req"); // reset leader heartbeat timer - self.hb_hear_timer.cancel()?; self.kickoff_hb_hear_timer()?; *paused = false; @@ -1841,8 +1842,13 @@ impl RSPaxosReplica { // discard the log before cut_offset if cut_offset > 0 { - self.storage_hub - .submit_action(0, LogAction::Discard { offset: cut_offset })?; + self.storage_hub.submit_action( + 0, + LogAction::Discard { + offset: cut_offset, + keep: 0, + }, + )?; loop { let (action_id, log_result) = self.storage_hub.get_result().await?; @@ -2325,7 +2331,7 @@ impl GenericReplica for RSPaxosReplica { // autonomous snapshot taking timeout _ = self.snapshot_interval.tick(), if !paused - && self.config.snapshot_interval_s > 0 => { + && self.config.snapshot_interval_s > 0 => { if let Err(e) = self.take_new_snapshot().await { pf_error!(self.id; "error taking a new snapshot: {}", e); } else { diff --git a/src/server/storage.rs b/src/server/storage.rs index a11d6ba6..06bc0430 100644 --- a/src/server/storage.rs +++ b/src/server/storage.rs @@ -44,8 +44,9 @@ pub enum LogAction { /// Truncate the log at given offset, keeping the head part. Truncate { offset: usize }, - /// Discard the log before given offset, keeping the tail part. - Discard { offset: usize }, + /// Discard the log before given offset, keeping the tail part (and + /// optionally a head part). + Discard { offset: usize, keep: usize }, } /// Action result returned by the logger. @@ -337,12 +338,14 @@ where } } - /// Discard the file before given index, keeping the tail part. + /// Discard the file before given index, keeping the tail part (and + /// optionally a head part). 
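
// [Editor's aside, not part of the patch.] With the new `keep` parameter, a
// successful Discard { offset, keep } drops the byte range [keep, offset) and
// moves the tail [offset, file_size) down to start at `keep`, so the resulting
// file size is keep + (file_size - offset). A tiny standalone helper, for
// illustration only, mirroring the arithmetic of discard_log() below:
fn size_after_discard(file_size: usize, offset: usize, keep: usize) -> usize {
    assert!(keep < offset && offset <= file_size);
    keep + (file_size - offset)
}

// e.g. a 1000-byte log with offset = 800 and keep = 64 (a metadata head, as
// Raft passes with keep = log_meta_end) shrinks to 64 + 200 = 264 bytes;
// keep = 0 recovers the previous Discard behavior.
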
async fn discard_log( me: ReplicaId, backer: &mut File, file_size: usize, offset: usize, + keep: usize, ) -> Result<(bool, usize), SummersetError> { if offset > file_size { pf_warn!( @@ -352,25 +355,32 @@ where file_size ); Ok((false, file_size)) + } else if keep >= offset { + pf_warn!( + me; + "discard keeping {} while offset is {}", + keep, offset + ); + Ok((false, file_size)) } else { let tail_size = file_size - offset; if tail_size > 0 { // due to the limited interfaces provided by `tokio::fs`, we - // read out the tail part and write it back to offset 0 to + // read out the tail part and write it back to offset keep to // achieve the effect of discarding let mut tail_buf: Vec = vec![0; tail_size]; backer.seek(SeekFrom::Start(offset as u64)).await?; backer.read_exact(&mut tail_buf[..]).await?; - backer.seek(SeekFrom::Start(0)).await?; + backer.seek(SeekFrom::Start(keep as u64)).await?; backer.write_all(&tail_buf[..]).await?; } - backer.set_len(tail_size as u64).await?; + backer.set_len((keep + tail_size) as u64).await?; backer.seek(SeekFrom::End(0)).await?; // recover cursor to EOF backer.sync_all().await?; - Ok((true, tail_size)) + Ok((true, keep + tail_size)) } } @@ -422,16 +432,16 @@ where } }) } - LogAction::Discard { offset } => { - Self::discard_log(me, backer, *file_size, offset).await.map( - |(offset_ok, now_size)| { + LogAction::Discard { offset, keep } => { + Self::discard_log(me, backer, *file_size, offset, keep) + .await + .map(|(offset_ok, now_size)| { *file_size = now_size; LogResult::Discard { offset_ok, now_size, } - }, - ) + }) } } } @@ -658,24 +668,55 @@ mod storage_tests { let mut backer_file = prepare_test_file("/tmp/test-backer-4.log").await?; let entry = TestEntry("test-entry-dummy-string".into()); - let mid_offset = + let mid1_offset = StorageHub::append_entry(0, &mut backer_file, 0, &entry, false) .await?; + let mid2_offset = StorageHub::append_entry( + 0, + &mut backer_file, + mid1_offset, + &entry, + false, + ) + .await?; let end_offset = StorageHub::append_entry( 0, &mut backer_file, - mid_offset, + mid2_offset, &entry, true, ) .await?; - let tail_size = end_offset - mid_offset; + let tail_size = end_offset - mid2_offset; assert_eq!( StorageHub::::discard_log( 0, &mut backer_file, end_offset, - mid_offset + mid2_offset, + mid1_offset, + ) + .await?, + (true, 2 * tail_size) + ); + assert_eq!( + StorageHub::::discard_log( + 0, + &mut backer_file, + 2 * tail_size, + mid1_offset, + end_offset, + ) + .await?, + (false, 2 * tail_size) + ); + assert_eq!( + StorageHub::::discard_log( + 0, + &mut backer_file, + 2 * tail_size, + mid1_offset, + 0, ) .await?, (true, tail_size) @@ -685,7 +726,8 @@ mod storage_tests { 0, &mut backer_file, tail_size, - end_offset + end_offset, + 0 ) .await?, (false, tail_size) @@ -695,7 +737,8 @@ mod storage_tests { 0, &mut backer_file, tail_size, - tail_size + tail_size, + 0 ) .await?, (true, 0) diff --git a/summerset_client/src/drivers/closed_loop.rs b/summerset_client/src/drivers/closed_loop.rs index a0a96e87..06e218df 100644 --- a/summerset_client/src/drivers/closed_loop.rs +++ b/summerset_client/src/drivers/closed_loop.rs @@ -99,46 +99,55 @@ impl DriverClosedLoop { })?; let issue_ts = Instant::now(); - let reply = self.recv_reply_with_timeout().await?; - match reply { - Some(ApiReply::Reply { - id: reply_id, - result: cmd_result, - redirect, - }) => { - if reply_id != req_id { - logged_err!(self.id; "request ID mismatch: expected {}, replied {}", - req_id, reply_id) - } else { - match cmd_result { - None => { - if let Some(server) = 
redirect { - Ok(DriverReply::Redirect { server }) - } else { - Ok(DriverReply::Failure) + loop { + let reply = self.recv_reply_with_timeout().await?; + match reply { + Some(ApiReply::Reply { + id: reply_id, + result: cmd_result, + redirect, + }) => { + if reply_id != req_id { + // logged_err!(self.id; "request ID mismatch: expected {}, replied {}", + // req_id, reply_id) + continue; + } else { + match cmd_result { + None => { + if let Some(server) = redirect { + return Ok(DriverReply::Redirect { + server, + }); + } else { + return Ok(DriverReply::Failure); + } } - } - Some(CommandResult::Get { value }) => { - let latency = - Instant::now().duration_since(issue_ts); - Ok(DriverReply::Success { - req_id, - cmd_result: CommandResult::Get { value }, - latency, - }) - } + Some(CommandResult::Get { value }) => { + let latency = + Instant::now().duration_since(issue_ts); + return Ok(DriverReply::Success { + req_id, + cmd_result: CommandResult::Get { value }, + latency, + }); + } - _ => { - logged_err!(self.id; "command type mismatch: expected Get") + _ => { + return logged_err!(self.id; "command type mismatch: expected Get"); + } } } } - } - None => Ok(DriverReply::Timeout), + None => { + return Ok(DriverReply::Timeout); + } - _ => logged_err!(self.id; "unexpected reply type received"), + _ => { + return logged_err!(self.id; "unexpected reply type received"); + } + } } } @@ -160,46 +169,57 @@ impl DriverClosedLoop { })?; let issue_ts = Instant::now(); - let reply = self.recv_reply_with_timeout().await?; - match reply { - Some(ApiReply::Reply { - id: reply_id, - result: cmd_result, - redirect, - }) => { - if reply_id != req_id { - logged_err!(self.id; "request ID mismatch: expected {}, replied {}", - req_id, reply_id) - } else { - match cmd_result { - None => { - if let Some(server) = redirect { - Ok(DriverReply::Redirect { server }) - } else { - Ok(DriverReply::Failure) + loop { + let reply = self.recv_reply_with_timeout().await?; + match reply { + Some(ApiReply::Reply { + id: reply_id, + result: cmd_result, + redirect, + }) => { + if reply_id != req_id { + // logged_err!(self.id; "request ID mismatch: expected {}, replied {}", + // req_id, reply_id) + continue; + } else { + match cmd_result { + None => { + if let Some(server) = redirect { + return Ok(DriverReply::Redirect { + server, + }); + } else { + return Ok(DriverReply::Failure); + } } - } - Some(CommandResult::Put { old_value }) => { - let latency = - Instant::now().duration_since(issue_ts); - Ok(DriverReply::Success { - req_id, - cmd_result: CommandResult::Put { old_value }, - latency, - }) - } + Some(CommandResult::Put { old_value }) => { + let latency = + Instant::now().duration_since(issue_ts); + return Ok(DriverReply::Success { + req_id, + cmd_result: CommandResult::Put { + old_value, + }, + latency, + }); + } - _ => { - logged_err!(self.id; "command type mismatch: expected Put") + _ => { + return logged_err!(self.id; "command type mismatch: expected Put"); + } } } } - } - None => Ok(DriverReply::Timeout), + None => { + return Ok(DriverReply::Timeout); + } - _ => logged_err!(self.id; "unexpected reply type received"), + _ => { + return logged_err!(self.id; "unexpected reply type received"); + } + } } } diff --git a/summerset_client/src/drivers/open_loop.rs b/summerset_client/src/drivers/open_loop.rs index 8e49c107..37a902d5 100644 --- a/summerset_client/src/drivers/open_loop.rs +++ b/summerset_client/src/drivers/open_loop.rs @@ -168,37 +168,45 @@ impl DriverOpenLoop { /// Waits for the next reply. 
pub async fn wait_reply(&mut self) -> Result { - let reply = self.recv_reply_with_timeout().await?; - match reply { - Some(ApiReply::Reply { - id: reply_id, - result: cmd_result, - redirect, - }) => { - if !self.pending_reqs.contains_key(&reply_id) { - logged_err!(self.id; "request ID {} not in pending set", - reply_id) - } else { - let issue_ts = self.pending_reqs.remove(&reply_id).unwrap(); - let latency = Instant::now().duration_since(issue_ts); - - if let Some(res) = cmd_result { - Ok(DriverReply::Success { - req_id: reply_id, - cmd_result: res, - latency, - }) - } else if let Some(server) = redirect { - Ok(DriverReply::Redirect { server }) + loop { + let reply = self.recv_reply_with_timeout().await?; + match reply { + Some(ApiReply::Reply { + id: reply_id, + result: cmd_result, + redirect, + }) => { + if !self.pending_reqs.contains_key(&reply_id) { + // logged_err!(self.id; "request ID {} not in pending set", + // reply_id) + continue; } else { - Ok(DriverReply::Failure) + let issue_ts = + self.pending_reqs.remove(&reply_id).unwrap(); + let latency = Instant::now().duration_since(issue_ts); + + if let Some(res) = cmd_result { + return Ok(DriverReply::Success { + req_id: reply_id, + cmd_result: res, + latency, + }); + } else if let Some(server) = redirect { + return Ok(DriverReply::Redirect { server }); + } else { + return Ok(DriverReply::Failure); + } } } - } - None => Ok(DriverReply::Timeout), + None => { + return Ok(DriverReply::Timeout); + } - _ => logged_err!(self.id; "unexpected reply type received"), + _ => { + return logged_err!(self.id; "unexpected reply type received"); + } + } } } From 5046591f509833e47c6305eb779bc00a24e3c250 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 8 Oct 2023 17:51:29 -0500 Subject: [PATCH 87/89] make Paxos variants snapshotting conservative --- src/protocols/crossword.rs | 91 +++++++++++++++++++++++++++++++------ src/protocols/multipaxos.rs | 91 +++++++++++++++++++++++++++++++------ src/protocols/rs_paxos.rs | 91 +++++++++++++++++++++++++++++++------ 3 files changed, 231 insertions(+), 42 deletions(-) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index 41d44a9b..dbf44a3e 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -4,6 +4,7 @@ //! dynamically tunable shard assignment with the correct liveness constraints, //! plus follower gossiping for actual usability. +use std::cmp; use std::collections::HashMap; use std::path::Path; use std::net::SocketAddr; @@ -263,7 +264,13 @@ enum PeerMsg { }, /// Leader activity heartbeat. - Heartbeat { ballot: Ballot, exec_bar: usize }, + Heartbeat { + ballot: Ballot, + /// For leader step-up as well as conservative snapshotting purpose. + exec_bar: usize, + /// For conservative snapshotting purpose. + snap_bar: usize, + }, } /// Crossword server replica module. @@ -352,6 +359,16 @@ pub struct CrosswordReplica { /// It is always true that exec_bar <= commit_bar <= start_slot + insts.len() exec_bar: usize, + /// Map from peer ID -> its latest exec_bar I know; this is for conservative + /// snapshotting purpose. + peer_exec_bar: HashMap, + + /// Slot index before which it is safe to take snapshot. + /// NOTE: we are taking a conservative approach here that a snapshot + /// covering an entry can be taken only when all servers have durably + /// committed (and executed) that entry. + snap_bar: usize, + /// Current durable WAL log file offset. 
wal_offset: usize, @@ -1484,9 +1501,11 @@ impl CrosswordReplica { PeerMsg::ReconstructReply { slots_data } => { self.handle_msg_reconstruct_reply(peer, slots_data) } - PeerMsg::Heartbeat { ballot, exec_bar } => { - self.heard_heartbeat(peer, ballot, exec_bar) - } + PeerMsg::Heartbeat { + ballot, + exec_bar, + snap_bar, + } => self.heard_heartbeat(peer, ballot, exec_bar, snap_bar), } } } @@ -1591,6 +1610,11 @@ impl CrosswordReplica { } self.bcast_heartbeats()?; + // re-initialize peer_exec_bar information + for slot in self.peer_exec_bar.values_mut() { + *slot = 0; + } + // make a greater ballot number and invalidate all in-progress instances self.bal_prepared = 0; self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen); @@ -1684,6 +1708,7 @@ impl CrosswordReplica { PeerMsg::Heartbeat { ballot: self.bal_prep_sent, exec_bar: self.exec_bar, + snap_bar: self.snap_bar, }, None, )?; @@ -1714,7 +1739,12 @@ impl CrosswordReplica { } // I also heard this heartbeat from myself - self.heard_heartbeat(self.id, self.bal_prep_sent, self.exec_bar)?; + self.heard_heartbeat( + self.id, + self.bal_prep_sent, + self.exec_bar, + self.snap_bar, + )?; // check if we need to fall back to a config with smaller fast-path // quorum size @@ -1755,6 +1785,7 @@ impl CrosswordReplica { peer: ReplicaId, ballot: Ballot, exec_bar: usize, + snap_bar: usize, ) -> Result<(), SummersetError> { if peer != self.id { self.hb_reply_cnts.get_mut(&peer).unwrap().0 += 1; @@ -1778,10 +1809,27 @@ impl CrosswordReplica { PeerMsg::Heartbeat { ballot, exec_bar: self.exec_bar, + snap_bar: self.snap_bar, }, peer, )?; + // update peer_exec_bar if larger then known; if all servers' + // exec_bar (including myself) have passed a slot, that slot + // is definitely safe to be snapshotted + if exec_bar > self.peer_exec_bar[&peer] { + *self.peer_exec_bar.get_mut(&peer).unwrap() = exec_bar; + let passed_cnt = 1 + self + .peer_exec_bar + .values() + .filter(|&&e| e >= exec_bar) + .count() as u8; + if passed_cnt == self.population { + // all servers have executed up to exec_bar + self.snap_bar = exec_bar; + } + } + // if the peer has made a higher ballot number if ballot > self.bal_max_seen { self.bal_max_seen = ballot; @@ -1798,6 +1846,11 @@ impl CrosswordReplica { } } + // if snap_bar is larger than mine, update snap_bar + if snap_bar > self.snap_bar { + self.snap_bar = snap_bar; + } + // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); Ok(()) } @@ -2254,10 +2307,13 @@ impl CrosswordReplica { // CrosswordReplica snapshotting & GC logic impl CrosswordReplica { /// Dump new key-value pairs to snapshot file. - async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { + async fn snapshot_dump_kv_pairs( + &mut self, + new_start_slot: usize, + ) -> Result<(), SummersetError> { // collect all key-value pairs put up to exec_bar let mut pairs = HashMap::new(); - for slot in self.start_slot..self.exec_bar { + for slot in self.start_slot..new_start_slot { let inst = &mut self.insts[slot - self.start_slot]; assert!(inst.reqs_cw.avail_data_shards() >= self.majority); for (_, req) in inst.reqs_cw.get_data()?.clone() { @@ -2357,10 +2413,12 @@ impl CrosswordReplica { /// Accept messages will be retried indefinitely until success before its /// associated data gets discarded from leader's memory. 
async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { - pf_debug!(self.id; "taking new snapshot: start {} exec {}", - self.start_slot, self.exec_bar); + pf_debug!(self.id; "taking new snapshot: start {} exec {} snap {}", + self.start_slot, self.exec_bar, self.snap_bar); assert!(self.exec_bar >= self.start_slot); - if self.exec_bar == self.start_slot { + + let new_start_slot = cmp::min(self.snap_bar, self.exec_bar); + if new_start_slot == self.start_slot { return Ok(()); } @@ -2369,14 +2427,14 @@ impl CrosswordReplica { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } - self.snapshot_dump_kv_pairs().await?; + self.snapshot_dump_kv_pairs(new_start_slot).await?; // write new slot info entry to the head of snapshot self.snapshot_hub.submit_action( 0, LogAction::Write { entry: SnapEntry::SlotInfo { - start_slot: self.exec_bar, + start_slot: new_start_slot, commit_bar: self.commit_bar, }, offset: 0, @@ -2394,8 +2452,8 @@ impl CrosswordReplica { } // update start_slot and discard all in-memory log instances up to exec_bar - self.insts.drain(0..(self.exec_bar - self.start_slot)); - self.start_slot = self.exec_bar; + self.insts.drain(0..(new_start_slot - self.start_slot)); + self.start_slot = new_start_slot; // discarding everything older than start_slot in WAL log if self.is_leader() { @@ -2436,6 +2494,7 @@ impl CrosswordReplica { self.start_slot = start_slot; self.commit_bar = commit_bar; self.exec_bar = start_slot; + self.snap_bar = start_slot; // repeatedly apply key-value pairs loop { @@ -2712,6 +2771,10 @@ impl GenericReplica for CrosswordReplica { bal_max_seen: 0, commit_bar: 0, exec_bar: 0, + peer_exec_bar: (0..population) + .filter_map(|s| if s == id { None } else { Some((s, 0)) }) + .collect(), + snap_bar: 0, wal_offset: 0, snap_offset: 0, rs_coder, diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 76955978..417217ab 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -7,6 +7,7 @@ //! - //! - +use std::cmp; use std::collections::HashMap; use std::path::Path; use std::net::SocketAddr; @@ -228,7 +229,13 @@ enum PeerMsg { Commit { slot: usize }, /// Leader activity heartbeat. - Heartbeat { ballot: Ballot, exec_bar: usize }, + Heartbeat { + ballot: Ballot, + /// For leader step-up as well as conservative snapshotting purpose. + exec_bar: usize, + /// For conservative snapshotting purpose. + snap_bar: usize, + }, } /// MultiPaxos server replica module. @@ -310,6 +317,16 @@ pub struct MultiPaxosReplica { /// It is always true that exec_bar <= commit_bar <= start_slot + insts.len() exec_bar: usize, + /// Map from peer ID -> its latest exec_bar I know; this is for conservative + /// snapshotting purpose. + peer_exec_bar: HashMap, + + /// Slot index before which it is safe to take snapshot. + /// NOTE: we are taking a conservative approach here that a snapshot + /// covering an entry can be taken only when all servers have durably + /// committed (and executed) that entry. + snap_bar: usize, + /// Current durable WAL log file offset. 
wal_offset: usize, @@ -1012,9 +1029,11 @@ impl MultiPaxosReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), - PeerMsg::Heartbeat { ballot, exec_bar } => { - self.heard_heartbeat(peer, ballot, exec_bar) - } + PeerMsg::Heartbeat { + ballot, + exec_bar, + snap_bar, + } => self.heard_heartbeat(peer, ballot, exec_bar, snap_bar), } } } @@ -1106,6 +1125,11 @@ impl MultiPaxosReplica { } self.bcast_heartbeats()?; + // re-initialize peer_exec_bar information + for slot in self.peer_exec_bar.values_mut() { + *slot = 0; + } + // make a greater ballot number and invalidate all in-progress instances self.bal_prepared = 0; self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen); @@ -1165,6 +1189,7 @@ impl MultiPaxosReplica { PeerMsg::Heartbeat { ballot: self.bal_prep_sent, exec_bar: self.exec_bar, + snap_bar: self.snap_bar, }, None, )?; @@ -1195,7 +1220,12 @@ impl MultiPaxosReplica { } // I also heard this heartbeat from myself - self.heard_heartbeat(self.id, self.bal_prep_sent, self.exec_bar)?; + self.heard_heartbeat( + self.id, + self.bal_prep_sent, + self.exec_bar, + self.snap_bar, + )?; // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent); Ok(()) @@ -1224,6 +1254,7 @@ impl MultiPaxosReplica { peer: ReplicaId, ballot: Ballot, exec_bar: usize, + snap_bar: usize, ) -> Result<(), SummersetError> { if peer != self.id { self.hb_reply_cnts.get_mut(&peer).unwrap().0 += 1; @@ -1247,10 +1278,27 @@ impl MultiPaxosReplica { PeerMsg::Heartbeat { ballot, exec_bar: self.exec_bar, + snap_bar: self.snap_bar, }, peer, )?; + // update peer_exec_bar if larger then known; if all servers' + // exec_bar (including myself) have passed a slot, that slot + // is definitely safe to be snapshotted + if exec_bar > self.peer_exec_bar[&peer] { + *self.peer_exec_bar.get_mut(&peer).unwrap() = exec_bar; + let passed_cnt = 1 + self + .peer_exec_bar + .values() + .filter(|&&e| e >= exec_bar) + .count() as u8; + if passed_cnt == self.population { + // all servers have executed up to exec_bar + self.snap_bar = exec_bar; + } + } + // if the peer has made a higher ballot number if ballot > self.bal_max_seen { self.bal_max_seen = ballot; @@ -1267,6 +1315,11 @@ impl MultiPaxosReplica { } } + // if snap_bar is larger than mine, update snap_bar + if snap_bar > self.snap_bar { + self.snap_bar = snap_bar; + } + // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); Ok(()) } @@ -1541,10 +1594,13 @@ impl MultiPaxosReplica { // MultiPaxosReplica snapshotting & GC logic impl MultiPaxosReplica { /// Dump new key-value pairs to snapshot file. - async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { + async fn snapshot_dump_kv_pairs( + &mut self, + new_start_slot: usize, + ) -> Result<(), SummersetError> { // collect all key-value pairs put up to exec_bar let mut pairs = HashMap::new(); - for slot in self.start_slot..self.exec_bar { + for slot in self.start_slot..new_start_slot { let inst = &self.insts[slot - self.start_slot]; for (_, req) in inst.reqs.clone() { if let ApiRequest::Req { @@ -1643,10 +1699,12 @@ impl MultiPaxosReplica { /// Accept messages will be retried indefinitely until success before its /// associated data gets discarded from leader's memory. 
async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { - pf_debug!(self.id; "taking new snapshot: start {} exec {}", - self.start_slot, self.exec_bar); + pf_debug!(self.id; "taking new snapshot: start {} exec {} snap {}", + self.start_slot, self.exec_bar, self.snap_bar); assert!(self.exec_bar >= self.start_slot); - if self.exec_bar == self.start_slot { + + let new_start_slot = cmp::min(self.snap_bar, self.exec_bar); + if new_start_slot == self.start_slot { return Ok(()); } @@ -1655,14 +1713,14 @@ impl MultiPaxosReplica { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } - self.snapshot_dump_kv_pairs().await?; + self.snapshot_dump_kv_pairs(new_start_slot).await?; // write new slot info entry to the head of snapshot self.snapshot_hub.submit_action( 0, LogAction::Write { entry: SnapEntry::SlotInfo { - start_slot: self.exec_bar, + start_slot: new_start_slot, commit_bar: self.commit_bar, }, offset: 0, @@ -1680,8 +1738,8 @@ impl MultiPaxosReplica { } // update start_slot and discard all in-memory log instances up to exec_bar - self.insts.drain(0..(self.exec_bar - self.start_slot)); - self.start_slot = self.exec_bar; + self.insts.drain(0..(new_start_slot - self.start_slot)); + self.start_slot = new_start_slot; // discarding everything older than start_slot in WAL log if self.is_leader() { @@ -1722,6 +1780,7 @@ impl MultiPaxosReplica { self.start_slot = start_slot; self.commit_bar = commit_bar; self.exec_bar = start_slot; + self.snap_bar = start_slot; // repeatedly apply key-value pairs loop { @@ -1968,6 +2027,10 @@ impl GenericReplica for MultiPaxosReplica { bal_max_seen: 0, commit_bar: 0, exec_bar: 0, + peer_exec_bar: (0..population) + .filter_map(|s| if s == id { None } else { Some((s, 0)) }) + .collect(), + snap_bar: 0, wal_offset: 0, snap_offset: 0, }) diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 0f7022f4..96aa8127 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -3,6 +3,7 @@ //! MultiPaxos with Reed-Solomon erasure coding. References: //! - +use std::cmp; use std::collections::HashMap; use std::path::Path; use std::net::SocketAddr; @@ -243,7 +244,13 @@ enum PeerMsg { }, /// Leader activity heartbeat. - Heartbeat { ballot: Ballot, exec_bar: usize }, + Heartbeat { + ballot: Ballot, + /// For leader step-up as well as conservative snapshotting purpose. + exec_bar: usize, + /// For conservative snapshotting purpose. + snap_bar: usize, + }, } /// RSPaxos server replica module. @@ -325,6 +332,16 @@ pub struct RSPaxosReplica { /// It is always true that exec_bar <= commit_bar <= start_slot + insts.len() exec_bar: usize, + /// Map from peer ID -> its latest exec_bar I know; this is for conservative + /// snapshotting purpose. + peer_exec_bar: HashMap, + + /// Slot index before which it is safe to take snapshot. + /// NOTE: we are taking a conservative approach here that a snapshot + /// covering an entry can be taken only when all servers have durably + /// committed (and executed) that entry. + snap_bar: usize, + /// Current durable WAL log file offset. 
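    // Note that peer_exec_bar above starts at 0 for every peer and is reset
    // to 0 again whenever this replica steps up as leader, so snap_bar can
    // only lag behind, never run ahead of, what is truly safe to snapshot;
    // the cost of this conservatism is a delayed snapshot, never an unsafe
    // one.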
wal_offset: usize, @@ -1232,9 +1249,11 @@ impl RSPaxosReplica { PeerMsg::ReconstructReply { slots_data } => { self.handle_msg_reconstruct_reply(peer, slots_data) } - PeerMsg::Heartbeat { ballot, exec_bar } => { - self.heard_heartbeat(peer, ballot, exec_bar) - } + PeerMsg::Heartbeat { + ballot, + exec_bar, + snap_bar, + } => self.heard_heartbeat(peer, ballot, exec_bar, snap_bar), } } } @@ -1327,6 +1346,11 @@ impl RSPaxosReplica { } self.bcast_heartbeats()?; + // re-initialize peer_exec_bar information + for slot in self.peer_exec_bar.values_mut() { + *slot = 0; + } + // make a greater ballot number and invalidate all in-progress instances self.bal_prepared = 0; self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen); @@ -1403,6 +1427,7 @@ impl RSPaxosReplica { PeerMsg::Heartbeat { ballot: self.bal_prep_sent, exec_bar: self.exec_bar, + snap_bar: self.snap_bar, }, None, )?; @@ -1433,7 +1458,12 @@ impl RSPaxosReplica { } // I also heard this heartbeat from myself - self.heard_heartbeat(self.id, self.bal_prep_sent, self.exec_bar)?; + self.heard_heartbeat( + self.id, + self.bal_prep_sent, + self.exec_bar, + self.snap_bar, + )?; // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent); Ok(()) @@ -1462,6 +1492,7 @@ impl RSPaxosReplica { peer: ReplicaId, ballot: Ballot, exec_bar: usize, + snap_bar: usize, ) -> Result<(), SummersetError> { if peer != self.id { self.hb_reply_cnts.get_mut(&peer).unwrap().0 += 1; @@ -1485,10 +1516,27 @@ impl RSPaxosReplica { PeerMsg::Heartbeat { ballot, exec_bar: self.exec_bar, + snap_bar: self.snap_bar, }, peer, )?; + // update peer_exec_bar if larger then known; if all servers' + // exec_bar (including myself) have passed a slot, that slot + // is definitely safe to be snapshotted + if exec_bar > self.peer_exec_bar[&peer] { + *self.peer_exec_bar.get_mut(&peer).unwrap() = exec_bar; + let passed_cnt = 1 + self + .peer_exec_bar + .values() + .filter(|&&e| e >= exec_bar) + .count() as u8; + if passed_cnt == self.population { + // all servers have executed up to exec_bar + self.snap_bar = exec_bar; + } + } + // if the peer has made a higher ballot number if ballot > self.bal_max_seen { self.bal_max_seen = ballot; @@ -1505,6 +1553,11 @@ impl RSPaxosReplica { } } + // if snap_bar is larger than mine, update snap_bar + if snap_bar > self.snap_bar { + self.snap_bar = snap_bar; + } + // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); Ok(()) } @@ -1795,10 +1848,13 @@ impl RSPaxosReplica { // RSPaxosReplica snapshotting & GC logic impl RSPaxosReplica { /// Dump new key-value pairs to snapshot file. - async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { + async fn snapshot_dump_kv_pairs( + &mut self, + new_start_slot: usize, + ) -> Result<(), SummersetError> { // collect all key-value pairs put up to exec_bar let mut pairs = HashMap::new(); - for slot in self.start_slot..self.exec_bar { + for slot in self.start_slot..new_start_slot { let inst = &mut self.insts[slot - self.start_slot]; assert!(inst.reqs_cw.avail_data_shards() >= self.majority); for (_, req) in inst.reqs_cw.get_data()?.clone() { @@ -1898,10 +1954,12 @@ impl RSPaxosReplica { /// Accept messages will be retried indefinitely until success before its /// associated data gets discarded from leader's memory. 
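    // One extra precondition in the erasure-coded variant, asserted in
    // snapshot_dump_kv_pairs() above: a slot can be dumped only if at least
    // `majority` data shards of its request batch are locally available, so
    // that reqs_cw.get_data() can reconstruct the full set of key-value
    // writes.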
async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { - pf_debug!(self.id; "taking new snapshot: start {} exec {}", - self.start_slot, self.exec_bar); + pf_debug!(self.id; "taking new snapshot: start {} exec {} snap {}", + self.start_slot, self.exec_bar, self.snap_bar); assert!(self.exec_bar >= self.start_slot); - if self.exec_bar == self.start_slot { + + let new_start_slot = cmp::min(self.snap_bar, self.exec_bar); + if new_start_slot == self.start_slot { return Ok(()); } @@ -1910,14 +1968,14 @@ impl RSPaxosReplica { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } - self.snapshot_dump_kv_pairs().await?; + self.snapshot_dump_kv_pairs(new_start_slot).await?; // write new slot info entry to the head of snapshot self.snapshot_hub.submit_action( 0, LogAction::Write { entry: SnapEntry::SlotInfo { - start_slot: self.exec_bar, + start_slot: new_start_slot, commit_bar: self.commit_bar, }, offset: 0, @@ -1935,8 +1993,8 @@ impl RSPaxosReplica { } // update start_slot and discard all in-memory log instances up to exec_bar - self.insts.drain(0..(self.exec_bar - self.start_slot)); - self.start_slot = self.exec_bar; + self.insts.drain(0..(new_start_slot - self.start_slot)); + self.start_slot = new_start_slot; // discarding everything older than start_slot in WAL log if self.is_leader() { @@ -1977,6 +2035,7 @@ impl RSPaxosReplica { self.start_slot = start_slot; self.commit_bar = commit_bar; self.exec_bar = start_slot; + self.snap_bar = start_slot; // repeatedly apply key-value pairs loop { @@ -2243,6 +2302,10 @@ impl GenericReplica for RSPaxosReplica { bal_max_seen: 0, commit_bar: 0, exec_bar: 0, + peer_exec_bar: (0..population) + .filter_map(|s| if s == id { None } else { Some((s, 0)) }) + .collect(), + snap_bar: 0, wal_offset: 0, snap_offset: 0, rs_coder, From 913dfa621a0ed1b4afa0b1e0642f4eb50c51df52 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 8 Oct 2023 18:34:53 -0500 Subject: [PATCH 88/89] add missing hole filling mechanism to Paxos variants --- src/protocols/crossword.rs | 70 +++++++++++++++++++++++++++++++++++++ src/protocols/multipaxos.rs | 59 +++++++++++++++++++++++++++++++ src/protocols/rs_paxos.rs | 62 ++++++++++++++++++++++++++++++++ 3 files changed, 191 insertions(+) diff --git a/src/protocols/crossword.rs b/src/protocols/crossword.rs index dbf44a3e..f71a6b5e 100644 --- a/src/protocols/crossword.rs +++ b/src/protocols/crossword.rs @@ -251,6 +251,10 @@ enum PeerMsg { /// Commit notification from leader to replicas. Commit { slot: usize }, + /// Request by a lagging replica to leader asking to re-send Accepts for + /// missing holes + FillHoles { slots: Vec }, + /// Reconstruction read from new leader to replicas. Reconstruct { /// Map from slot -> shards to exclude. @@ -949,6 +953,21 @@ impl CrosswordReplica { } } + // if there are hole(s) between current commit_bar and newly committed + // slot, ask the leader to re-send Accept messages for those slots + if slot > self.commit_bar && !self.is_leader() { + if let Some(leader) = self.leader { + let holes: Vec = (self.commit_bar..slot).collect(); + self.transport_hub.send_msg( + PeerMsg::FillHoles { + slots: holes.clone(), + }, + leader, + )?; + pf_trace!(self.id; "sent FillHoles -> {} slots {:?}", leader, holes); + } + } + Ok(()) } @@ -1355,6 +1374,54 @@ impl CrosswordReplica { Ok(()) } + /// Handler of FillHoles message from a lagging peer. 
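    // For example (slot numbers are illustrative): if a follower's
    // commit_bar is 5 when a Commit for slot 9 arrives, slots 5..9 are
    // holes, so it sends FillHoles { slots: vec![5, 6, 7, 8] } to the
    // leader; the handler below is the leader-side half of that exchange.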
+ fn handle_msg_fill_holes( + &mut self, + peer: ReplicaId, + slots: Vec, + ) -> Result<(), SummersetError> { + if !self.is_leader() { + return Ok(()); + } + pf_trace!(self.id; "received FillHoles <- {} for slots {:?}", peer, slots); + + for slot in slots { + if slot < self.start_slot { + continue; + } else if slot >= self.start_slot + self.insts.len() { + break; + } + let inst = &self.insts[slot - self.start_slot]; + + if inst.status >= Status::Committed { + // re-send Accept message for this slot + self.transport_hub.send_msg( + PeerMsg::Accept { + slot, + ballot: self.bal_prepared, + reqs_cw: inst.reqs_cw.subset_copy( + Bitmap::from( + self.population, + Self::shards_for_replica( + slot, + peer, + self.population, + self.shards_per_replica, + ), + ), + false, + )?, + }, + peer, + )?; + pf_trace!(self.id; "sent Accept -> {} for slot {} bal {}", + peer, slot, self.bal_prepared); + } + } + + Ok(()) + } + /// Handler of Reconstruct message from leader. fn handle_msg_reconstruct( &mut self, @@ -1495,6 +1562,9 @@ impl CrosswordReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), + PeerMsg::FillHoles { slots } => { + self.handle_msg_fill_holes(peer, slots) + } PeerMsg::Reconstruct { slots_excl } => { self.handle_msg_reconstruct(peer, slots_excl) } diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 417217ab..435cbf3c 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -228,6 +228,10 @@ enum PeerMsg { /// Commit notification from leader to replicas. Commit { slot: usize }, + /// Request by a lagging replica to leader asking to re-send Accepts for + /// missing holes + FillHoles { slots: Vec }, + /// Leader activity heartbeat. Heartbeat { ballot: Ballot, @@ -680,6 +684,21 @@ impl MultiPaxosReplica { } } + // if there are hole(s) between current commit_bar and newly committed + // slot, ask the leader to re-send Accept messages for those slots + if slot > self.commit_bar && !self.is_leader() { + if let Some(leader) = self.leader { + let holes: Vec = (self.commit_bar..slot).collect(); + self.transport_hub.send_msg( + PeerMsg::FillHoles { + slots: holes.clone(), + }, + leader, + )?; + pf_trace!(self.id; "sent FillHoles -> {} slots {:?}", leader, holes); + } + } + Ok(()) } @@ -1007,6 +1026,43 @@ impl MultiPaxosReplica { Ok(()) } + /// Handler of FillHoles message from a lagging peer. + fn handle_msg_fill_holes( + &mut self, + peer: ReplicaId, + slots: Vec, + ) -> Result<(), SummersetError> { + if !self.is_leader() { + return Ok(()); + } + pf_trace!(self.id; "received FillHoles <- {} for slots {:?}", peer, slots); + + for slot in slots { + if slot < self.start_slot { + continue; + } else if slot >= self.start_slot + self.insts.len() { + break; + } + let inst = &self.insts[slot - self.start_slot]; + + if inst.status >= Status::Committed { + // re-send Accept message for this slot + self.transport_hub.send_msg( + PeerMsg::Accept { + slot, + ballot: self.bal_prepared, + reqs: inst.reqs.clone(), + }, + peer, + )?; + pf_trace!(self.id; "sent Accept -> {} for slot {} bal {}", + peer, slot, self.bal_prepared); + } + } + + Ok(()) + } + /// Synthesized handler of receiving message from peer. 
fn handle_msg_recv( &mut self, @@ -1029,6 +1085,9 @@ impl MultiPaxosReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), + PeerMsg::FillHoles { slots } => { + self.handle_msg_fill_holes(peer, slots) + } PeerMsg::Heartbeat { ballot, exec_bar, diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 96aa8127..31fbcb69 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -234,6 +234,10 @@ enum PeerMsg { /// Commit notification from leader to replicas. Commit { slot: usize }, + /// Request by a lagging replica to leader asking to re-send Accepts for + /// missing holes + FillHoles { slots: Vec }, + /// Reconstruction read from new leader to replicas. Reconstruct { slots: Vec }, @@ -739,6 +743,21 @@ impl RSPaxosReplica { } } + // if there are hole(s) between current commit_bar and newly committed + // slot, ask the leader to re-send Accept messages for those slots + if slot > self.commit_bar && !self.is_leader() { + if let Some(leader) = self.leader { + let holes: Vec = (self.commit_bar..slot).collect(); + self.transport_hub.send_msg( + PeerMsg::FillHoles { + slots: holes.clone(), + }, + leader, + )?; + pf_trace!(self.id; "sent FillHoles -> {} slots {:?}", leader, holes); + } + } + Ok(()) } @@ -1108,6 +1127,46 @@ impl RSPaxosReplica { Ok(()) } + /// Handler of FillHoles message from a lagging peer. + fn handle_msg_fill_holes( + &mut self, + peer: ReplicaId, + slots: Vec, + ) -> Result<(), SummersetError> { + if !self.is_leader() { + return Ok(()); + } + pf_trace!(self.id; "received FillHoles <- {} for slots {:?}", peer, slots); + + for slot in slots { + if slot < self.start_slot { + continue; + } else if slot >= self.start_slot + self.insts.len() { + break; + } + let inst = &self.insts[slot - self.start_slot]; + + if inst.status >= Status::Committed { + // re-send Accept message for this slot + self.transport_hub.send_msg( + PeerMsg::Accept { + slot, + ballot: self.bal_prepared, + reqs_cw: inst.reqs_cw.subset_copy( + Bitmap::from(self.population, vec![peer]), + false, + )?, + }, + peer, + )?; + pf_trace!(self.id; "sent Accept -> {} for slot {} bal {}", + peer, slot, self.bal_prepared); + } + } + + Ok(()) + } + /// Handler of Reconstruct message from leader. 
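    // Worth noting how the three variants differ in what they re-send to
    // fill a hole: MultiPaxos re-sends the full request batch, RS-Paxos
    // copies out just the shard at the requesting peer's index, and
    // Crossword copies out the shard subset that shards_for_replica()
    // assigns to that peer.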
fn handle_msg_reconstruct( &mut self, @@ -1243,6 +1302,9 @@ impl RSPaxosReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), + PeerMsg::FillHoles { slots } => { + self.handle_msg_fill_holes(peer, slots) + } PeerMsg::Reconstruct { slots } => { self.handle_msg_reconstruct(peer, slots) } From 8fd45a3ca4d90c314c5e683ef7d7e4db5e0c38c1 Mon Sep 17 00:00:00 2001 From: Guanzhou Hu Date: Sun, 8 Oct 2023 19:01:01 -0500 Subject: [PATCH 89/89] add Raft to workflow proc tests --- .github/workflows/tests_proc.yml | 6 ++++-- .github/workflows/tests_unit.yml | 2 +- scripts/workflow_test.py | 14 ++++++++++++++ src/protocols/raft.rs | 3 ++- 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/.github/workflows/tests_proc.yml b/.github/workflows/tests_proc.yml index e8fbde3f..dbd7195e 100644 --- a/.github/workflows/tests_proc.yml +++ b/.github/workflows/tests_proc.yml @@ -16,5 +16,7 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Run proc tests - run: python3 scripts/workflow_test.py + - name: Run proc tests (MultiPaxos) + run: python3 scripts/workflow_test.py -p MultiPaxos + - name: Run proc tests (Raft) + run: python3 scripts/workflow_test.py -p Raft diff --git a/.github/workflows/tests_unit.yml b/.github/workflows/tests_unit.yml index 0a1fd8d6..57aa8fb3 100644 --- a/.github/workflows/tests_unit.yml +++ b/.github/workflows/tests_unit.yml @@ -16,5 +16,5 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Run unit tests + - name: Run all unit tests run: cargo test --workspace --verbose diff --git a/scripts/workflow_test.py b/scripts/workflow_test.py index 33484aca..eb176a7f 100644 --- a/scripts/workflow_test.py +++ b/scripts/workflow_test.py @@ -1,5 +1,6 @@ import sys import os +import argparse import subprocess @@ -76,6 +77,12 @@ def run_tester_client(protocol, test_name): if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-p", "--protocol", type=str, required=True, help="protocol name" + ) + args = parser.parse_args() + do_cargo_build() kill_all_matching("local_client.py", force=True) @@ -85,6 +92,13 @@ def run_tester_client(protocol, test_name): kill_all_matching("summerset_manager", force=True) PROTOCOL = "MultiPaxos" + if args.protocol == "MultiPaxos": + pass + elif args.protocol == "Raft": + PROTOCOL = "Raft" + else: + raise ValueError(f"unrecognized protocol {args.protocol} to run workflow test") + NUM_REPLICAS = 3 TEST_NAME = "primitive_ops" TIMEOUT = 300 diff --git a/src/protocols/raft.rs b/src/protocols/raft.rs index 5dc54a83..4ffc04f5 100644 --- a/src/protocols/raft.rs +++ b/src/protocols/raft.rs @@ -1427,6 +1427,7 @@ impl RaftReplica { } // do an extra Truncate to remove paritial entry at the end if any + assert!(self.log_offset >= self.log_meta_end); self.storage_hub.submit_action( 0, LogAction::Truncate { @@ -1438,7 +1439,7 @@ impl RaftReplica { offset_ok: true, .. } = log_result { - if self.log_offset > 0 { + if self.log_offset > self.log_meta_end { pf_info!(self.id; "recovered from wal log: term {} voted {:?} |log| {}", self.curr_term, self.voted_for, self.log.len()); }
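        // The two changes above tighten the recovery check: the WAL
        // presumably begins with a metadata entry (current term and
        // voted_for) that ends at log_meta_end, so an offset equal to
        // log_meta_end means an otherwise empty log; the "recovered from
        // wal log" message is now printed only when actual entries were
        // replayed, and the new assert records that the offset can never
        // fall short of the metadata region. The matching CI change runs
        // the workflow proc test once per protocol, e.g.
        // `python3 scripts/workflow_test.py -p Raft`.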