Skip to content

Commit 91ac920

Browse files
ahsanbarkatimangalaman93
authored andcommitted
fix(restore): Do not retry restore proposal (#8058)
Do not retry the restore proposal. It can cause issues in the edge case scenarios. Consider the following scenario: 1. alpha-2 gets the restore request (leader is alpha-0) 2. alpha-2 sends the request to alpha-0 (leader). 3. alpha-0 called proposeAndWait which proposed the req (index 24) at time=15:56:10 4. alpha-0 was still waiting for the proposal to be applied and RPC call for `Restore` by alpha-2 got "transport closing error" at time=15:59:08 5. transport closing is a retriable error, so alpha-2 again tried to proposeoOrSend, this time leader was alpha-1, so it sent it to alpha-1 6. alpha-1 proposed the restore request (index 28) at time=15:59:09
1 parent 283fe63 commit 91ac920

File tree

1 file changed

+1
-37
lines changed

1 file changed

+1
-37
lines changed

worker/online_restore.go

+1-37
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ import (
2222
"path/filepath"
2323
"strings"
2424
"sync"
25-
"time"
2625

2726
"github.com/golang/glog"
2827
"github.com/golang/protobuf/proto"
@@ -149,7 +148,7 @@ func ProcessRestoreRequest(ctx context.Context, req *pb.RestoreRequest, wg *sync
149148
reqCopy.GroupId = gid
150149
wg.Add(1)
151150
go func() {
152-
errCh <- tryRestoreProposal(ctx, reqCopy)
151+
errCh <- proposeRestoreOrSend(ctx, reqCopy)
153152
}()
154153
}
155154

@@ -182,41 +181,6 @@ func proposeRestoreOrSend(ctx context.Context, req *pb.RestoreRequest) error {
182181
return err
183182
}
184183

185-
func retriableRestoreError(err error) bool {
186-
switch {
187-
case err == conn.ErrNoConnection:
188-
// Try to recover from temporary connection issues.
189-
return true
190-
case strings.Contains(err.Error(), "Raft isn't initialized yet"):
191-
// Try to recover if raft has not been initialized.
192-
return true
193-
case strings.Contains(err.Error(), errRestoreProposal):
194-
// Do not try to recover from other errors when sending the proposal.
195-
return false
196-
default:
197-
// Try to recover from other errors (e.g wrong group, waiting for timestamp, etc).
198-
return true
199-
}
200-
}
201-
202-
func tryRestoreProposal(ctx context.Context, req *pb.RestoreRequest) error {
203-
var err error
204-
for i := 0; i < 10; i++ {
205-
err = proposeRestoreOrSend(ctx, req)
206-
if err == nil {
207-
glog.Info("proposeRestoreOrSend done.")
208-
return nil
209-
}
210-
glog.Errorf("Got error while proposeRestoreOrSend: %v, retrying!", err)
211-
if retriableRestoreError(err) {
212-
time.Sleep(time.Second)
213-
continue
214-
}
215-
return err
216-
}
217-
return err
218-
}
219-
220184
// Restore implements the Worker interface.
221185
func (w *grpcWorker) Restore(ctx context.Context, req *pb.RestoreRequest) (*pb.Status, error) {
222186
var emptyRes pb.Status

0 commit comments

Comments
 (0)