diff --git a/Documentation/rfc/v3api.md b/Documentation/rfc/v3api.md
new file mode 100644
index 00000000000..21a3d52bdcb
--- /dev/null
+++ b/Documentation/rfc/v3api.md
@@ -0,0 +1,488 @@
+## Design
+
+1. Flat binary key-value space
+
+2. Keep the event history until compaction
+    - access to old versions of keys
+    - user-controlled history compaction
+
+3. Support range query
+    - Pagination support with limit argument
+
+4. Replace TTL key with Lease
+    - more efficient / low-cost keep-alive
+    - a logical group of TTL keys
+
+5. Replace CAS/CAD with multi-object Tnx
+    - MUCH MORE powerful and flexible
+
+6. Support efficient watching with multiple ranges
+
+7. RPC API supports the complete set of APIs.
+    - more efficient than JSON/HTTP
+    - additional tnx/lease support
+
+8. HTTP API supports a subset of APIs.
+    - easy for people to try out etcd
+    - easy for people to write simple etcd applications
+
+
+## Protobuf Defined API
+
+``` protobuf
+syntax = "proto3";
+
+// Interface exported by the server.
+service etcd {
+  // Range gets the keys in the range from the store.
+  // Maybe rename it to GET?
+  rpc Range(RangeRequest) returns (RangeResponse) {}
+
+  // Put puts the given key into the store.
+  // A put request increases the index of the store,
+  // and generates one event in the event history.
+  rpc Put(PutRequest) returns (PutResponse) {}
+
+  // Delete deletes the given key from the store.
+  // A delete request increases the index of the store,
+  // and generates one event in the event history.
+  rpc Delete(DeleteRequest) returns (DeleteResponse) {}
+
+  // Tnx processes all the requests in one transaction.
+  // A tnx request increases the index of the store,
+  // and generates one event in the event history.
+  rpc Tnx(TnxRequest) returns (TnxResponse) {}
+
+  // Watch watches the events happening or happened in etcd. Both input and output
+  // are streams. One watch rpc can watch for multiple ranges and get a stream of
+  // events. The whole event history can be watched unless compacted.
+  rpc Watch(stream WatchRequest) returns (stream WatchResponse) {}
+
+  // Compact compacts the event history in etcd. Users should compact the
+  // event history periodically, or it will grow infinitely.
+  rpc Compact(CompactRequest) returns (CompactResponse) {}
+
+  // LeaseCreate creates a lease. A lease has a TTL. The lease will expire if the
+  // server does not receive a keepAlive within TTL from the lease holder.
+  // All keys attached to the lease will be expired and deleted if the lease expires.
+  // The key expiration generates an event in event history.
+  rpc LeaseCreate(LeaseCreateRequest) returns (LeaseCreateResponse) {}
+
+  // LeaseRevoke revokes a lease. All the keys attached to the lease will be expired and deleted.
+  rpc LeaseRevoke(LeaseRevokeRequest) returns (LeaseRevokeResponse) {}
+
+  // LeaseAttach attaches keys with a lease.
+  rpc LeaseAttach(LeaseAttachRequest) returns (LeaseAttachResponse) {}
+
+  // LeaseTnx is like Tnx. It does one additional thing: all the keys put into the
+  // store are attached with the given lease. This rpc is useful when you want to
+  // put a key and attach it to a lease atomically.
+  rpc LeaseTnx(LeaseTnxRequest) returns (LeaseTnxResponse) {}
+
+  // KeepAlive keeps the lease alive.
+  rpc LeaseKeepAlive(stream LeaseKeepAliveRequest) returns (stream LeaseKeepAliveResponse) {}
+}
+
+// RequestHeader identifies the cluster a request is intended for.
+message RequestHeader {
+  // the cluster id to check with
+  // if the cluster id is not matched, an error is returned.
+  optional uint64 cluster_id = 1;
+}
+
+// ResponseHeader is the metadata returned in the header field of a response.
+message ResponseHeader {
+  // an error type message?
+  optional string error = 1;
+  optional uint64 cluster_id = 2;
+  optional uint64 member_id = 3;
+  // index of the store when the request was processed.
+  optional int64 index = 4;
+  optional uint64 raft_term = 5;
+}
+
+// RangeRequest selects a single key, or the keys in [key, end_key).
+message RangeRequest {
+  optional bytes key = 1;
+  // if the end_key is not given, it is a get.
+  // if the end_key is given, it gets the keys in range [key, end_key).
+  optional bytes end_key = 2;
+  // limit the number of keys returned.
+  optional int64 limit = 3;
+}
+
+// RangeResponse returns the key-value pairs selected by a RangeRequest.
+message RangeResponse {
+  optional ResponseHeader header = 1;
+  repeated KeyValue kvs = 2;
+}
+
+// PutRequest names the key to put into the store and its value.
+message PutRequest {
+  optional bytes key = 1;
+  optional bytes value = 2;
+}
+
+// PutResponse is the result of a Put.
+message PutResponse {
+  optional ResponseHeader header = 1;
+}
+
+// DeleteRequest names the key to delete from the store.
+message DeleteRequest {
+  optional bytes key = 1;
+}
+
+// DeleteResponse is the result of a Delete.
+message DeleteResponse {
+  optional ResponseHeader header = 1;
+}
+
+// RequestUnion holds exactly one of the requests that can be part of a Tnx.
+message RequestUnion {
+  oneof value {
+    RangeRequest request_range = 1;
+    PutRequest request_put = 2;
+    DeleteRequest request_delete = 3;
+  }
+}
+
+// ResponseUnion holds exactly one of the responses that a Tnx can return.
+message ResponseUnion {
+  oneof value {
+    RangeResponse response_range = 1;
+    PutResponse response_put = 2;
+    DeleteResponse response_delete = 3;
+  }
+}
+
+// Compare is one comparison of a Tnx: a property of key is compared against the given target.
+message Compare {
+  enum CompareType {
+    equal = 0;
+    greater = 1;
+    less = 2;
+  }
+  optional CompareType type = 1;
+  // key path
+  optional bytes key = 2;
+  oneof target {
+    // version of the given key
+    int64 version = 3;
+    // create index of the given key
+    int64 create_index = 4;
+    // last modified index of the given key
+    int64 mod_index = 5;
+    // value of the given key
+    bytes value = 6;
+  }
+}
+
+// First all the compare requests are processed.
+// If all the comparisons succeed, all the success
+// requests will be processed.
+// Otherwise all the failure requests will be processed and
+// all the errors in the comparison will be returned.
+
+// From google paxosdb paper:
+// Our implementation hinges around a powerful primitive which we call MultiOp. All other database
+// operations except for iteration are implemented as a single call to MultiOp. A MultiOp is applied atomically
+// and consists of three components:
+// 1. A list of tests called guard. Each test in guard checks a single entry in the database. It may check
+// for the absence or presence of a value, or compare with a given value. Two different tests in the guard
+// may apply to the same or different entries in the database. All tests in the guard are applied and
+// MultiOp returns the results. If all tests are true, MultiOp executes t op (see item 2 below), otherwise
+// it executes f op (see item 3 below).
+// 2. A list of database operations called t op. Each operation in the list is either an insert, delete, or
+// lookup operation, and applies to a single database entry. Two different operations in the list may apply
+// to the same or different entries in the database. These operations are executed
+// if guard evaluates to true.
+// 3. A list of database operations called f op. Like t op, but executed if guard evaluates to false.
+message TnxRequest {
+  repeated Compare compare = 1;
+  repeated RequestUnion success = 2;
+  repeated RequestUnion failure = 3;
+}
+
+// TnxResponse reports whether the comparisons succeeded and carries the responses of the processed requests.
+message TnxResponse {
+  optional ResponseHeader header = 1;
+  optional bool succeeded = 2;
+  repeated ResponseUnion responses = 3;
+}
+
+// KeyValue is a key, its value, and the indexes and version tracked for it.
+message KeyValue {
+  optional bytes key = 1;
+  // create_index is the index at which the key was created.
+  optional int64 create_index = 2;
+  // mod_index is the last modified index of the key.
+  optional int64 mod_index = 3;
+  // version is the version of the key. A deletion resets
+  // the version to zero and any modification of the key
+  // increases its version.
+  optional int64 version = 4;
+  optional bytes value = 5;
+}
+
+// WatchRequest asks for the events on a key or on a range of keys.
+message WatchRequest {
+  optional bytes key = 1;
+  optional bytes end_key = 2;
+  // start_index is inclusive
+  optional int64 start_index = 3;
+  // end_index is exclusive
+  optional int64 end_index = 4;
+  // if set, the server sends notifications (responses without an event) at a frequency it decides
+  optional bool progress_notification = 5;
+}
+
+// WatchResponse carries one event, or no event when it is only a notification.
+message WatchResponse {
+  optional ResponseHeader header = 1;
+  optional Event event = 2;
+}
+
+// Event is a group of actions.
+message Event {
+  repeated Action actions = 1;
+}
+
+// Action is one change (put, delete or expire) and the key-value it applies to.
+message Action {
+  enum ActionType {
+    put = 0;
+    delete = 1;
+    expire = 2;
+  }
+  optional ActionType event_type = 1;
+  optional KeyValue kv = 2;
+}
+
+// CompactRequest asks the server to compact the event history at index.
+message CompactRequest {
+  optional int64 index = 1;
+}
+
+// CompactResponse is the result of a Compact.
+message CompactResponse {
+  optional ResponseHeader header = 1;
+}
+
+// LeaseCreateRequest asks the server to create a lease.
+message LeaseCreateRequest {
+  // advisory ttl in seconds
+  optional int64 ttl = 1;
+}
+
+// LeaseCreateResponse returns the id of the created lease and the ttl chosen by the server.
+message LeaseCreateResponse {
+  optional int64 lease_id = 1;
+  // server decided ttl in seconds
+  optional int64 ttl = 2;
+  optional string error = 3;
+}
+
+// LeaseRevokeRequest names the lease to revoke.
+message LeaseRevokeRequest {
+  optional int64 lease_id = 1;
+}
+
+// LeaseRevokeResponse is the result of a LeaseRevoke.
+message LeaseRevokeResponse {
+  optional string error = 1;
+}
+
+// LeaseTnxRequest is a TnxRequest whose put keys are attached to the lease lease_id.
+message LeaseTnxRequest {
+  optional int64 lease_id = 1;
+  optional TnxRequest request = 2;
+}
+
+// LeaseTnxResponse wraps the TnxResponse of a LeaseTnx.
+message LeaseTnxResponse {
+  optional TnxResponse response = 1;
+  optional string error = 2;
+}
+
+// LeaseAttachRequest names the key to attach and the lease to attach it to.
+message LeaseAttachRequest {
+  optional int64 lease_id = 1;
+  optional bytes key = 2;
+}
+
+// LeaseAttachResponse is the result of a LeaseAttach.
+message LeaseAttachResponse {
+  optional string error = 1;
+}
+
+// LeaseKeepAliveRequest names the lease to keep alive.
+message LeaseKeepAliveRequest {
+  optional int64 lease_id = 1;
+}
+
+// LeaseKeepAliveResponse is the server's reply to a keep-alive for lease_id.
+message LeaseKeepAliveResponse {
+  optional int64 lease_id = 1;
+  optional int64 ttl = 2;
+  optional string error = 3;
+}
+```
+
+### Examples
+
+#### Put a key (foo=bar)
+```
+// A put is always successful
+Put( PutRequest { key = foo, value = bar } )
+
+PutResponse {
+    cluster_id = 0x1000,
+    member_id = 0x1,
+    index = 1,
+    raft_term = 0x1,
+}
+```
+
+#### Get a key (assume we have foo=bar)
+```
+Range( RangeRequest { key = foo } )
+
+RangeResponse {
+    cluster_id = 0x1000,
+    member_id = 0x1,
+    index = 1,
+    raft_term = 0x1,
+    kvs = {
+      {
+          key = foo,
+          value = bar,
+          create_index = 1,
+          mod_index = 1,
+          version = 1,
+      },
+    },
+}
+```
+
+#### Range over a key space (assume we have foo0=bar0… foo100=bar100)
+```
+Range( RangeRequest { key = foo, end_key = foo80, limit = 30 } )
+
+RangeResponse {
+    cluster_id = 0x1000,
+    member_id = 0x1,
+    index = 100,
+    raft_term = 0x1,
+    kvs = {
+      {
+          key = foo0,
+          value = bar0,
+          create_index = 1,
+          mod_index = 1,
+          version = 1,
+      },
+      ...,
+      {
+          key = foo30,
+          value = bar30,
+          create_index = 30,
+          mod_index = 30,
+          version = 1,
+      },
+    },
+}
+```
+
+#### Finish a tnx (assume we have foo0=bar0, foo1=bar1)
+```
+Tnx(TnxRequest {
+    // mod_index of foo0 is equal to 1, mod_index of foo1 is greater than 1
+    compare = {
+        {type = equal, key = foo0, mod_index = 1},
+        {type = greater, key = foo1, mod_index = 1}
+    },
+    // if the comparison succeeds, put foo2=success
+    success = {PutRequest { key = foo2, value = success }},
+    // if the comparison fails, put foo2=failure
+    failure = {PutRequest { key = foo2, value = failure }},
+})
+
+TnxResponse {
+    cluster_id = 0x1000,
+    member_id = 0x1,
+    index = 3,
+    raft_term = 0x1,
+    succeeded = true,
+    responses = {
+      // response of PUT foo2=success
+      {
+            cluster_id = 0x1000,
+            member_id = 0x1,
+            index = 3,
+            raft_term = 0x1,
+        }
+    }
+}
+```
+
+#### Watch on a key/range
+
+```
+Watch( WatchRequest{
+           key = foo,
+           end_key = fop, // prefix foo
+           start_index = 20,
+           end_index = 10000,
+           // server decided notification frequency
+           progress_notification = true,
+         }
+         … // this can be a watch request stream
+      )
+
+// put (foo0=bar0) event at 3
+WatchResponse {
+    cluster_id = 0x1000,
+    member_id = 0x1,
+    index = 3,
+    raft_term = 0x1,
+    event_type = put,
+    kv = {
+              key = foo0,
+              value = bar0,
+              create_index = 1,
+              mod_index = 1,
+              version = 1,
+          },
+    }
+    …
+
+    // a notification at 2000
+    WatchResponse {
+        cluster_id = 0x1000,
+        member_id = 0x1,
+        index = 2000,
+        raft_term = 0x1,
+        // nil event as notification
+    }
+
+    …
+
+    // put (foo0=bar3000) event at 3000
+    WatchResponse {
+        cluster_id = 0x1000,
+        member_id = 0x1,
+        index = 3000,
+        raft_term = 0x1,
+        event_type = put,
+        kv = {
+                key = foo0,
+                value = bar3000,
+                create_index = 1,
+                mod_index = 3000,
+                version = 2,
+          },
+    }
+    …
+
+```
\ No newline at end of file