diff --git a/node.go b/node.go index 40e9dab6..a9d8fa12 100644 --- a/node.go +++ b/node.go @@ -441,7 +441,7 @@ func (n *node) run() { } readyc = nil - traceNodeEvent(rsmReady, r) + traceReady(r) case <-advancec: n.rn.Advance(rd) rd = Ready{} diff --git a/raft.go b/raft.go index d7b9d381..4b9ed72c 100644 --- a/raft.go +++ b/raft.go @@ -762,7 +762,7 @@ func (r *raft) appliedSnap(snap *pb.Snapshot) { // the commit index changed (in which case the caller should call // r.bcastAppend). func (r *raft) maybeCommit() bool { - defer traceNodeEvent(rsmCommit, r) + defer traceCommit(r) mci := r.prs.Committed() return r.raftLog.maybeCommit(mci, r.Term) @@ -805,7 +805,7 @@ func (r *raft) appendEntry(es ...pb.Entry) (accepted bool) { es[i].Term = r.Term es[i].Index = li + 1 + uint64(i) if es[i].Type == pb.EntryNormal { - traceNodeEvent(rsmReplicate, r) + traceReplicate(r) } } // Track the size of this uncommitted proposal. @@ -883,7 +883,7 @@ func (r *raft) becomeFollower(term uint64, lead uint64) { r.state = StateFollower r.logger.Infof("%x became follower at term %d", r.id, r.Term) - traceNodeEvent(rsmBecomeFollower, r) + traceBecomeFollower(r) } func (r *raft) becomeCandidate() { @@ -898,7 +898,7 @@ func (r *raft) becomeCandidate() { r.state = StateCandidate r.logger.Infof("%x became candidate at term %d", r.id, r.Term) - traceNodeEvent(rsmBecomeCandidate, r) + traceBecomeCandidate(r) } func (r *raft) becomePreCandidate() { @@ -945,7 +945,7 @@ func (r *raft) becomeLeader() { // could be expensive. r.pendingConfIndex = r.raftLog.lastIndex() - traceNodeEvent(rsmBecomeLeader, r) + traceBecomeLeader(r) emptyEnt := pb.Entry{Data: nil} if !r.appendEntry(emptyEnt) { // This won't happen because we just called reset() above. diff --git a/state_trace.go b/state_trace.go index fff0f6bc..e835d8a5 100644 --- a/state_trace.go +++ b/state_trace.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build with_tla +// +build with_tla + package raft import ( @@ -22,6 +25,8 @@ import ( "go.etcd.io/raft/v3/tracker" ) +const EnableStateTrace = true + type stateMachineEventType int const ( @@ -159,20 +164,6 @@ type TraceLogger interface { TraceEvent(*TracingEvent) } -func traceInitStateOnce(r *raft) { - if r.traceLogger == nil { - return - } - - if r.initStateTraced { - return - } - - r.initStateTraced = true - - traceNodeEvent(rsmInitState, r) -} - func traceEvent(evt stateMachineEventType, r *raft, m *raftpb.Message, prop map[string]any) { if r.traceLogger == nil { return @@ -198,6 +189,58 @@ func traceNodeEvent(evt stateMachineEventType, r *raft) { traceEvent(evt, r, nil, nil) } +func formatConf(s []uint64) []string { + if s == nil { + return []string{} + } + + r := make([]string, len(s)) + for i, v := range s { + r[i] = strconv.FormatUint(v, 10) + } + return r +} + +// Use following helper functions to trace specific state and/or +// transition at corresponding code lines +func traceInitStateOnce(r *raft) { + if r.traceLogger == nil { + return + } + + if r.initStateTraced { + return + } + + r.initStateTraced = true + + traceNodeEvent(rsmInitState, r) +} + +func traceReady(r *raft) { + traceNodeEvent(rsmReady, r) +} + +func traceCommit(r *raft) { + traceNodeEvent(rsmCommit, r) +} + +func traceReplicate(r *raft) { + traceNodeEvent(rsmReplicate, r) +} + +func traceBecomeFollower(r *raft) { + traceNodeEvent(rsmBecomeFollower, r) +} + +func traceBecomeCandidate(r *raft) { + traceNodeEvent(rsmBecomeCandidate, r) +} + +func traceBecomeLeader(r *raft) { + traceNodeEvent(rsmBecomeLeader, r) +} + func traceChangeConfEvent(cci raftpb.ConfChangeI, r *raft) { cc2 := cci.AsV2() cc := &TracingConfChange{ @@ -294,15 +337,3 @@ func traceReceiveMessage(r *raft, m *raftpb.Message) { time.Sleep(time.Millisecond) // sleep 1ms to reduce time shift impact accross node traceEvent(evt, r, m, nil) } - -func formatConf(s []uint64) []string { - if s == nil { - return []string{} - } - - r := make([]string, len(s)) - for i, v := range s { - r[i] = strconv.FormatUint(v, 10) - } - return r -} diff --git a/state_trace_nop.go b/state_trace_nop.go new file mode 100644 index 00000000..d94648d5 --- /dev/null +++ b/state_trace_nop.go @@ -0,0 +1,51 @@ +// Copyright 2015 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !with_tla +// +build !with_tla + +package raft + +import ( + "go.etcd.io/raft/v3/raftpb" + "go.etcd.io/raft/v3/tracker" +) + +const EnableStateTrace = false + +type TraceLogger interface{} + +type TracingEvent struct{} + +func traceInitStateOnce(*raft) {} + +func traceReady(*raft) {} + +func traceCommit(*raft) {} + +func traceReplicate(*raft) {} + +func traceBecomeFollower(*raft) {} + +func traceBecomeCandidate(*raft) {} + +func traceBecomeLeader(*raft) {} + +func traceChangeConfEvent(raftpb.ConfChangeI, *raft) {} + +func traceConfChangeEvent(tracker.Config, *raft) {} + +func traceSendMessage(*raft, *raftpb.Message) {} + +func traceReceiveMessage(*raft, *raftpb.Message) {} diff --git a/tla/README.md b/tla/README.md new file mode 100644 index 00000000..9e593703 --- /dev/null +++ b/tla/README.md @@ -0,0 +1,72 @@ +# TLA+ Specification and Trace Validation for Raft Library: A Comprehensive Guide + +This document presents comprehensive guidelines on how to create a TLA+ specification for the Raft library as implemented in `etcd-io/raft`. The distinctive behaviors of this library, such as reconfiguration, set it apart from the original Raft algorithm. A TLA+ specification serves dual purposes: it not only helps verify the correctness of the model but also facilitates model-based trace validation. + +## Checking the Model with TLA+ Specifications + +The TLA+ specifications can be verified using the TLC model checker. Here are a few methods: + +1. **TLA+ Toolbox**: This is an ideal tool for an in-depth study of the specification. For more information, refer to the [TLA+ Toolbox guideline](https://lamport.azurewebsites.net/tla/toolbox.html). + +2. **VSCode Plugin TLA+ Nightly**: This is a viable alternative to the TLA+ Toolbox, particularly for those accustomed to using VSCode. For more information, refer to the [VSCode Plugin TLA+ Nightly guideline](https://github.com/tlaplus/vscode-tlaplus/wiki). + +3. **CLI**: This is the best option for integration with existing test frameworks and automation. For more information, refer to the [CLI guideline](https://learntla.com/topics/cli.html). You can execute a typical command line to verify etcdraft.tla as shown below. Please ensure that tla2tools.jar and CommunityModules-deps.jar have been downloaded and are available in the current folder before proceeding. + + ```console + java -XX:+UseParallelGC -Dtlc2.tool.impl.Tool.cdot=true -cp tla2tools.jar:CommunityModules-deps.jar tlc2.TLC -workers auto MCetcdraft.tla -dumpTrace tla MCetcdraft.trace.tla -dumpTrace json MCetcdraft.json -lncheck final + ``` + +## TLA+ Model-Based Trace Validation + +The correctness of applications based on a consensus algorithm is ensured by two factors: the correctness of the algorithm itself, and the alignment of the implementation with the algorithm. + +The first factor, the correctness of the algorithm, is assured through model checking the specification. The second component, the alignment between the implementation and the algorithm, is fortified through trace validation, which serves to bridge the gap between the model and its implementation. + +To ensure the robustness of trace validation, we initially inject multiple trace log points at specific sections in the code where state transitions occur (for example, SendAppendEntries, BecomeLeader, etcd). The trace validation specification leverages these traces as a state space constraint, guiding the state machine to traverse in a manner identical to that of the service. + +If a trace suggests a state or transition that the state machine can't accommodate, it indicates a discrepancy between the model and its implementation. In cases where the model has already been verified by the TLC model checker, it's more likely that any issues arise from the implementation rather than the model. + +## Enabling and Running TLA+ Trace Validation + +To activate trace collection, build the application using the "with_tla" tag (`go build -tags=with_tla`). The `StartNode` and/or `RestartNode` should be invoked with `Config` and the `TraceLogger` property set to an instance of `TraceLogger` interface. This instance emits tracing events in the correct format. + +To implement an applicaiton specific trace logger + +```go +type TraceLogger interface { + TraceEvent(*TracingEvent) +} + +type MyTraceLogger struct{} + +func (t *MyTraceLogger) TraceEvent(e *TracingEvent) { + // implementation details +} + +``` + +To start a threee-node cluster with trace logger +```go +storage := raft.NewMemoryStorage() + c := &raft.Config{ + ID: 0x01, + ElectionTick: 10, + HeartbeatTick: 1, + Storage: storage, + MaxSizePerMsg: 4096, + MaxInflightMsgs: 256, + TraceLogger: &MyTraceLogger{}, + } + // Set peer list to the other nodes in the cluster. + // Note that they need to be started separately as well. + n := raft.StartNode(c, []raft.Peer{{ID: 0x02}, {ID: 0x03}}) +``` +To preserve the causality of events across nodes, run all application instances on the same machine and store traces in the same file. This approach maintains the order of traces in all instances. + +Once the trace file is ready, compress it (as required by validate.sh) and validate it with the following CLI: + +```console +./validate.sh example.ndjson.gz +``` +**Note**: Raft generates numerous traces. Storing these traces on a ramdisk can significantly mitigate the application's performance impact. +