Skip to content

Commit

Permalink
1. Add README.md
Browse files Browse the repository at this point in the history
2. Isolate tracing code from production

Signed-off-by: Joshua Zhang <joshuazh@microsoft.com>
  • Loading branch information
joshuazh-x committed Jan 17, 2024
1 parent b7eb424 commit 7f35117
Show file tree
Hide file tree
Showing 5 changed files with 186 additions and 32 deletions.
2 changes: 1 addition & 1 deletion node.go
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@ func (n *node) run() {
}
readyc = nil

traceNodeEvent(rsmReady, r)
traceReady(r)
case <-advancec:
n.rn.Advance(rd)
rd = Ready{}
Expand Down
10 changes: 5 additions & 5 deletions raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -762,7 +762,7 @@ func (r *raft) appliedSnap(snap *pb.Snapshot) {
// the commit index changed (in which case the caller should call
// r.bcastAppend).
func (r *raft) maybeCommit() bool {
defer traceNodeEvent(rsmCommit, r)
defer traceCommit(r)

mci := r.prs.Committed()
return r.raftLog.maybeCommit(mci, r.Term)
Expand Down Expand Up @@ -805,7 +805,7 @@ func (r *raft) appendEntry(es ...pb.Entry) (accepted bool) {
es[i].Term = r.Term
es[i].Index = li + 1 + uint64(i)
if es[i].Type == pb.EntryNormal {
traceNodeEvent(rsmReplicate, r)
traceReplicate(r)
}
}
// Track the size of this uncommitted proposal.
Expand Down Expand Up @@ -883,7 +883,7 @@ func (r *raft) becomeFollower(term uint64, lead uint64) {
r.state = StateFollower
r.logger.Infof("%x became follower at term %d", r.id, r.Term)

traceNodeEvent(rsmBecomeFollower, r)
traceBecomeFollower(r)
}

func (r *raft) becomeCandidate() {
Expand All @@ -898,7 +898,7 @@ func (r *raft) becomeCandidate() {
r.state = StateCandidate
r.logger.Infof("%x became candidate at term %d", r.id, r.Term)

traceNodeEvent(rsmBecomeCandidate, r)
traceBecomeCandidate(r)
}

func (r *raft) becomePreCandidate() {
Expand Down Expand Up @@ -945,7 +945,7 @@ func (r *raft) becomeLeader() {
// could be expensive.
r.pendingConfIndex = r.raftLog.lastIndex()

traceNodeEvent(rsmBecomeLeader, r)
traceBecomeLeader(r)
emptyEnt := pb.Entry{Data: nil}
if !r.appendEntry(emptyEnt) {
// This won't happen because we just called reset() above.
Expand Down
83 changes: 57 additions & 26 deletions state_trace.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build with_tla
// +build with_tla

package raft

import (
Expand All @@ -22,6 +25,8 @@ import (
"go.etcd.io/raft/v3/tracker"
)

const EnableStateTrace = true

type stateMachineEventType int

const (
Expand Down Expand Up @@ -159,20 +164,6 @@ type TraceLogger interface {
TraceEvent(*TracingEvent)
}

func traceInitStateOnce(r *raft) {
if r.traceLogger == nil {
return
}

if r.initStateTraced {
return
}

r.initStateTraced = true

traceNodeEvent(rsmInitState, r)
}

func traceEvent(evt stateMachineEventType, r *raft, m *raftpb.Message, prop map[string]any) {
if r.traceLogger == nil {
return
Expand All @@ -198,6 +189,58 @@ func traceNodeEvent(evt stateMachineEventType, r *raft) {
traceEvent(evt, r, nil, nil)
}

func formatConf(s []uint64) []string {
if s == nil {
return []string{}
}

r := make([]string, len(s))
for i, v := range s {
r[i] = strconv.FormatUint(v, 10)
}
return r
}

// Use following helper functions to trace specific state and/or
// transition at corresponding code lines
func traceInitStateOnce(r *raft) {
if r.traceLogger == nil {
return
}

if r.initStateTraced {
return
}

r.initStateTraced = true

traceNodeEvent(rsmInitState, r)
}

func traceReady(r *raft) {
traceNodeEvent(rsmReady, r)
}

func traceCommit(r *raft) {
traceNodeEvent(rsmCommit, r)
}

func traceReplicate(r *raft) {
traceNodeEvent(rsmReplicate, r)
}

func traceBecomeFollower(r *raft) {
traceNodeEvent(rsmBecomeFollower, r)
}

func traceBecomeCandidate(r *raft) {
traceNodeEvent(rsmBecomeCandidate, r)
}

func traceBecomeLeader(r *raft) {
traceNodeEvent(rsmBecomeLeader, r)
}

func traceChangeConfEvent(cci raftpb.ConfChangeI, r *raft) {
cc2 := cci.AsV2()
cc := &TracingConfChange{
Expand Down Expand Up @@ -294,15 +337,3 @@ func traceReceiveMessage(r *raft, m *raftpb.Message) {
time.Sleep(time.Millisecond) // sleep 1ms to reduce time shift impact accross node
traceEvent(evt, r, m, nil)
}

func formatConf(s []uint64) []string {
if s == nil {
return []string{}
}

r := make([]string, len(s))
for i, v := range s {
r[i] = strconv.FormatUint(v, 10)
}
return r
}
51 changes: 51 additions & 0 deletions state_trace_nop.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build !with_tla
// +build !with_tla

package raft

import (
"go.etcd.io/raft/v3/raftpb"
"go.etcd.io/raft/v3/tracker"
)

const EnableStateTrace = false

type TraceLogger interface{}

type TracingEvent struct{}

func traceInitStateOnce(*raft) {}

func traceReady(*raft) {}

func traceCommit(*raft) {}

func traceReplicate(*raft) {}

func traceBecomeFollower(*raft) {}

func traceBecomeCandidate(*raft) {}

func traceBecomeLeader(*raft) {}

func traceChangeConfEvent(raftpb.ConfChangeI, *raft) {}

func traceConfChangeEvent(tracker.Config, *raft) {}

func traceSendMessage(*raft, *raftpb.Message) {}

func traceReceiveMessage(*raft, *raftpb.Message) {}
72 changes: 72 additions & 0 deletions tla/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# TLA+ Specification and Trace Validation for Raft Library: A Comprehensive Guide

This document presents comprehensive guidelines on how to create a TLA+ specification for the Raft library as implemented in `etcd-io/raft`. The distinctive behaviors of this library, such as reconfiguration, set it apart from the original Raft algorithm. A TLA+ specification serves dual purposes: it not only helps verify the correctness of the model but also facilitates model-based trace validation.

## Checking the Model with TLA+ Specifications

The TLA+ specifications can be verified using the TLC model checker. Here are a few methods:

1. **TLA+ Toolbox**: This is an ideal tool for an in-depth study of the specification. For more information, refer to the [TLA+ Toolbox guideline](https://lamport.azurewebsites.net/tla/toolbox.html).

2. **VSCode Plugin TLA+ Nightly**: This is a viable alternative to the TLA+ Toolbox, particularly for those accustomed to using VSCode. For more information, refer to the [VSCode Plugin TLA+ Nightly guideline](https://github.com/tlaplus/vscode-tlaplus/wiki).

3. **CLI**: This is the best option for integration with existing test frameworks and automation. For more information, refer to the [CLI guideline](https://learntla.com/topics/cli.html). You can execute a typical command line to verify etcdraft.tla as shown below. Please ensure that tla2tools.jar and CommunityModules-deps.jar have been downloaded and are available in the current folder before proceeding.

```console
java -XX:+UseParallelGC -Dtlc2.tool.impl.Tool.cdot=true -cp tla2tools.jar:CommunityModules-deps.jar tlc2.TLC -workers auto MCetcdraft.tla -dumpTrace tla MCetcdraft.trace.tla -dumpTrace json MCetcdraft.json -lncheck final
```

## TLA+ Model-Based Trace Validation

The correctness of applications based on a consensus algorithm is ensured by two factors: the correctness of the algorithm itself, and the alignment of the implementation with the algorithm.

The first factor, the correctness of the algorithm, is assured through model checking the specification. The second component, the alignment between the implementation and the algorithm, is fortified through trace validation, which serves to bridge the gap between the model and its implementation.

To ensure the robustness of trace validation, we initially inject multiple trace log points at specific sections in the code where state transitions occur (for example, SendAppendEntries, BecomeLeader, etcd). The trace validation specification leverages these traces as a state space constraint, guiding the state machine to traverse in a manner identical to that of the service.

If a trace suggests a state or transition that the state machine can't accommodate, it indicates a discrepancy between the model and its implementation. In cases where the model has already been verified by the TLC model checker, it's more likely that any issues arise from the implementation rather than the model.

## Enabling and Running TLA+ Trace Validation

To activate trace collection, build the application using the "with_tla" tag (`go build -tags=with_tla`). The `StartNode` and/or `RestartNode` should be invoked with `Config` and the `TraceLogger` property set to an instance of `TraceLogger` interface. This instance emits tracing events in the correct format.

To implement an applicaiton specific trace logger

```go
type TraceLogger interface {
TraceEvent(*TracingEvent)
}

type MyTraceLogger struct{}

func (t *MyTraceLogger) TraceEvent(e *TracingEvent) {
// implementation details
}

```

To start a threee-node cluster with trace logger
```go
storage := raft.NewMemoryStorage()
c := &raft.Config{
ID: 0x01,
ElectionTick: 10,
HeartbeatTick: 1,
Storage: storage,
MaxSizePerMsg: 4096,
MaxInflightMsgs: 256,
TraceLogger: &MyTraceLogger{},
}
// Set peer list to the other nodes in the cluster.
// Note that they need to be started separately as well.
n := raft.StartNode(c, []raft.Peer{{ID: 0x02}, {ID: 0x03}})
```
To preserve the causality of events across nodes, run all application instances on the same machine and store traces in the same file. This approach maintains the order of traces in all instances.

Once the trace file is ready, compress it (as required by validate.sh) and validate it with the following CLI:

```console
./validate.sh example.ndjson.gz
```
**Note**: Raft generates numerous traces. Storing these traces on a ramdisk can significantly mitigate the application's performance impact.

0 comments on commit 7f35117

Please sign in to comment.