-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
Copy pathraft.proto
215 lines (198 loc) · 9.87 KB
/
raft.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
syntax = "proto2";
package raftpb;
option go_package = "github.com/cockroachdb/cockroach/pkg/raft/raftpb";
import "gogoproto/gogo.proto";
option (gogoproto.marshaler_all) = true;
option (gogoproto.sizer_all) = true;
option (gogoproto.unmarshaler_all) = true;
option (gogoproto.goproto_getters_all) = false;
option (gogoproto.goproto_enum_prefix_all) = false;
option (gogoproto.goproto_unkeyed_all) = false;
option (gogoproto.goproto_unrecognized_all) = false;
option (gogoproto.goproto_sizecache_all) = false;
enum EntryType {
EntryNormal = 0;
EntryConfChange = 1; // corresponds to pb.ConfChange
EntryConfChangeV2 = 2; // corresponds to pb.ConfChangeV2
}
message Entry {
optional uint64 Term = 2 [(gogoproto.nullable) = false]; // must be 64-bit aligned for atomic operations
optional uint64 Index = 3 [(gogoproto.nullable) = false]; // must be 64-bit aligned for atomic operations
optional EntryType Type = 1 [(gogoproto.nullable) = false];
optional bytes Data = 4;
}
message SnapshotMetadata {
optional ConfState conf_state = 1 [(gogoproto.nullable) = false];
optional uint64 index = 2 [(gogoproto.nullable) = false];
optional uint64 term = 3 [(gogoproto.nullable) = false];
}
message Snapshot {
optional bytes data = 1;
optional SnapshotMetadata metadata = 2 [(gogoproto.nullable) = false];
}
// For description of different message types, see:
// https://pkg.go.dev/go.etcd.io/raft/v3#hdr-MessageType
enum MessageType {
MsgHup = 0;
MsgBeat = 1;
MsgProp = 2;
MsgApp = 3;
MsgAppResp = 4;
MsgVote = 5;
MsgVoteResp = 6;
MsgSnap = 7;
MsgHeartbeat = 8;
MsgHeartbeatResp = 9;
MsgUnreachable = 10;
MsgSnapStatus = 11;
MsgCheckQuorum = 12;
MsgTransferLeader = 13;
MsgTimeoutNow = 14;
MsgPreVote = 17;
MsgPreVoteResp = 18;
MsgStorageAppend = 19;
MsgStorageAppendResp = 20;
MsgStorageApply = 21;
MsgStorageApplyResp = 22;
MsgForgetLeader = 23;
// NOTE: when adding new message types, remember to update the isLocalMsg and
// isResponseMsg arrays in raft/util.go and update the corresponding tests in
// raft/util_test.go.
reserved 15, 16; // used to be MsgReadIndex(Resp)
}
message Message {
optional MessageType type = 1 [(gogoproto.nullable) = false];
optional uint64 to = 2 [(gogoproto.nullable) = false];
optional uint64 from = 3 [(gogoproto.nullable) = false];
optional uint64 term = 4 [(gogoproto.nullable) = false];
// logTerm is generally used for appending Raft logs to followers. For example,
// (type=MsgApp,index=100,logTerm=5) means the leader appends entries starting
// at index=101, and the term of the entry at index 100 is 5.
// (type=MsgAppResp,reject=true,index=100,logTerm=5) means follower rejects some
// entries from its leader as it already has an entry with term 5 at index 100.
// (type=MsgStorageAppendResp,index=100,logTerm=5) means the local node wrote
// entries up to index=100 in stable storage, and the term of the entry at index
// 100 was 5. This doesn't always mean that the corresponding MsgStorageAppend
// message was the one that carried these entries, just that those entries were
// stable at the time of processing the corresponding MsgStorageAppend.
optional uint64 logTerm = 5 [(gogoproto.nullable) = false];
optional uint64 index = 6 [(gogoproto.nullable) = false];
repeated Entry entries = 7 [(gogoproto.nullable) = false];
optional uint64 commit = 8 [(gogoproto.nullable) = false];
// (type=MsgStorageAppend,vote=5,term=10) means the local node is voting for
// peer 5 in term 10. For MsgStorageAppends, the term, vote, and commit fields
// will either all be set (to facilitate the construction of a HardState) if
// any of the fields have changed or will all be unset if none of the fields
// have changed.
optional uint64 vote = 13 [(gogoproto.nullable) = false];
// snapshot is non-nil and non-empty for MsgSnap messages and nil for all other
// message types. However, peer nodes running older binary versions may send a
// non-nil, empty value for the snapshot field of non-MsgSnap messages. Code
// should be prepared to handle such messages.
optional Snapshot snapshot = 9 [(gogoproto.nullable) = true];
optional bool reject = 10 [(gogoproto.nullable) = false];
optional uint64 rejectHint = 11 [(gogoproto.nullable) = false];
optional bytes context = 12 [(gogoproto.nullable) = true];
// responses are populated by a raft node to instruct storage threads on how
// to respond and who to respond to when the work associated with a message
// is complete. Populated for MsgStorageAppend and MsgStorageApply messages.
repeated Message responses = 14 [(gogoproto.nullable) = false];
}
message HardState {
optional uint64 term = 1 [(gogoproto.nullable) = false];
optional uint64 vote = 2 [(gogoproto.nullable) = false];
optional uint64 commit = 3 [(gogoproto.nullable) = false];
}
// ConfChangeTransition specifies the behavior of a configuration change with
// respect to joint consensus.
enum ConfChangeTransition {
// Automatically use the simple protocol if possible, otherwise fall back
// to ConfChangeJointImplicit. Most applications will want to use this.
ConfChangeTransitionAuto = 0;
// Use joint consensus unconditionally, and transition out of them
// automatically (by proposing a zero configuration change).
//
// This option is suitable for applications that want to minimize the time
// spent in the joint configuration and do not store the joint configuration
// in the state machine (outside of InitialState).
ConfChangeTransitionJointImplicit = 1;
// Use joint consensus and remain in the joint configuration until the
// application proposes a no-op configuration change. This is suitable for
// applications that want to explicitly control the transitions, for example
// to use a custom payload (via the Context field).
ConfChangeTransitionJointExplicit = 2;
}
message ConfState {
// The voters in the incoming config. (If the configuration is not joint,
// then the outgoing config is empty).
repeated uint64 voters = 1;
// The learners in the incoming config.
repeated uint64 learners = 2;
// The voters in the outgoing config.
repeated uint64 voters_outgoing = 3;
// The nodes that will become learners when the outgoing config is removed.
// These nodes are necessarily currently in nodes_joint (or they would have
// been added to the incoming config right away).
repeated uint64 learners_next = 4;
// If set, the config is joint and Raft will automatically transition into
// the final config (i.e. remove the outgoing config) when this is safe.
optional bool auto_leave = 5 [(gogoproto.nullable) = false];
}
enum ConfChangeType {
ConfChangeAddNode = 0;
ConfChangeRemoveNode = 1;
ConfChangeUpdateNode = 2;
ConfChangeAddLearnerNode = 3;
}
message ConfChange {
optional ConfChangeType type = 2 [(gogoproto.nullable) = false];
optional uint64 node_id = 3 [(gogoproto.nullable) = false, (gogoproto.customname) = "NodeID"];
optional bytes context = 4;
// NB: this is used only by etcd to thread through a unique identifier.
// Ideally it should really use the Context instead. No counterpart to
// this field exists in ConfChangeV2.
optional uint64 id = 1 [(gogoproto.nullable) = false, (gogoproto.customname) = "ID"];
}
// ConfChangeSingle is an individual configuration change operation. Multiple
// such operations can be carried out atomically via a ConfChangeV2.
message ConfChangeSingle {
optional ConfChangeType type = 1 [(gogoproto.nullable) = false];
optional uint64 node_id = 2 [(gogoproto.nullable) = false, (gogoproto.customname) = "NodeID"];
}
// ConfChangeV2 messages initiate configuration changes. They support both the
// simple "one at a time" membership change protocol and full Joint Consensus
// allowing for arbitrary changes in membership.
//
// The supplied context is treated as an opaque payload and can be used to
// attach an action on the state machine to the application of the config change
// proposal. Note that contrary to Joint Consensus as outlined in the Raft
// paper[1], configuration changes become active when they are *applied* to the
// state machine (not when they are appended to the log).
//
// The simple protocol can be used whenever only a single change is made.
//
// Non-simple changes require the use of Joint Consensus, for which two
// configuration changes are run. The first configuration change specifies the
// desired changes and transitions the Raft group into the joint configuration,
// in which quorum requires a majority of both the pre-changes and post-changes
// configuration. Joint Consensus avoids entering fragile intermediate
// configurations that could compromise survivability. For example, without the
// use of Joint Consensus and running across three availability zones with a
// replication factor of three, it is not possible to replace a voter without
// entering an intermediate configuration that does not survive the outage of
// one availability zone.
//
// The provided ConfChangeTransition specifies how (and whether) Joint Consensus
// is used, and assigns the task of leaving the joint configuration either to
// Raft or the application. Leaving the joint configuration is accomplished by
// proposing a ConfChangeV2 with only and optionally the Context field
// populated.
//
// For details on Raft membership changes, see:
//
// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf
message ConfChangeV2 {
optional ConfChangeTransition transition = 1 [(gogoproto.nullable) = false];
repeated ConfChangeSingle changes = 2 [(gogoproto.nullable) = false];
optional bytes context = 3;
}