[RB] Handle retries in the CLI rather than server-side (#8195)

Extracted CLI code from #8172 This PR disables automatic retries when using the remote bazel CLI. The CLI then implements retry behavior itself so it can detect when a retry is happening This fixes a bug where if the remote run fails due to a transient error, the CLI will report a failure. However the scheduler may retry the run in the background and it may eventually succeed. When users go to the invocation link, they'll see that it succeeded despite expecting a failure Slack thread (Issue 2): https://buildbuddy-corp.slack.com/archives/C07GMM2VBLY/p1734595804049229?thread_ts=1734595606.837269&cid=C07GMM2VBLY Should be merged after #8172 is rolled out
buildbuddy-io · Feb 6, 2025 · 923eb4d · 923eb4d
1 parent d9e5f87
commit 923eb4d
Show file tree

Hide file tree

Showing 2 changed files with 121 additions and 77 deletions.
diff --git a/cli/remotebazel/remotebazel.go b/cli/remotebazel/remotebazel.go
@@ -68,6 +68,8 @@ const (
 	// `git remote` output is expected to look like:
 	// `origin	git@github.com:buildbuddy-io/buildbuddy.git (fetch)`
 	gitRemoteRegex = `(.+)\s+(.+)\s+\((push|fetch)\)`
+
+	maxRetries = 5
 )
 
 var (
@@ -89,6 +91,7 @@ var (
 	// Ex. --run_from_snapshot='{"snapshotId":"XXX","instanceName":""}'
 	runFromSnapshot = RemoteFlagset.String("run_from_snapshot", "", "JSON for a snapshot key that the remote runner should be resumed from. If unset, the snapshot key is determined programatically.")
 	script          = RemoteFlagset.String("script", "", "Shell code to run remotely instead of a Bazel command.")
+	disableRetry    = RemoteFlagset.Bool("disable_retry", false, "By default, transient errors are automatically retried. This behavior can be disabled, if a command is non-idempotent for example.")
 )
 
 func consoleCursorMoveUp(y int) {
@@ -885,6 +888,9 @@ func Run(ctx context.Context, opts RunOpts, repoConfig *RepoConfig) (int, error)
 		ExecProperties: platform.Properties,
 		RemoteHeaders:  *remoteHeaders,
 		RunRemotely:    *runRemotely,
+		// In order to detect and notify on retry, this client will implement
+		// retry behavior itself. Direct the server to not retry.
+		DisableRetry: true,
 		Steps: []*rnpb.Step{
 			{
 				Run: opts.Command,
@@ -904,93 +910,34 @@ func Run(ctx context.Context, opts RunOpts, repoConfig *RepoConfig) (int, error)
 	if len(encodedReq) > 0 {
 		log.Debugf("Run request: %s", string(encodedReq))
 	}
-
 	log.Printf("\nWaiting for available remote runner...\n")
-	rsp, err := bbClient.Run(ctx, req)
-	if err != nil {
-		return 1, status.UnknownErrorf("error running bazel: %s", err)
-	}
 
-	iid := rsp.GetInvocationId()
-	log.Debugf("Invocation ID: %s", iid)
+	retry := !*disableRetry
+	retryCount := 0
 
-	// If the remote bazel process is canceled or killed, cancel the remote run
-	isInvocationRunning := true
-	go func() {
-		<-ctx.Done()
+	var inRsp *inpb.GetInvocationResponse
+	var executeResponse *repb.ExecuteResponse
+	var latestErr error
+	for {
+		inRsp, executeResponse, latestErr = attemptRun(ctx, bbClient, execClient, req)
 
-		if !isInvocationRunning {
-			return
+		if len(inRsp.GetInvocation()) > 0 && inRsp.GetInvocation()[0].Success ||
+			!rexec.Retryable(err) ||
+			status.IsDeadlineExceededError(err) ||
+			ctx.Err() != nil {
+			retry = false
 		}
 
-		// Use a non-cancelled context to ensure the remote executions are
-		// canceled
-		_, err = bbClient.CancelExecutions(context.WithoutCancel(ctx), &inpb.CancelExecutionsRequest{
-			InvocationId: iid,
-		})
-		if err != nil {
-			log.Warnf("Failed to cancel remote run: %s", err)
+		if !retry || retryCount >= maxRetries {
+			break
 		}
-	}()
 
-	interactive := terminal.IsTTY(os.Stdin) && terminal.IsTTY(os.Stderr)
-	if interactive {
-		if err := streamLogs(ctx, bbClient, iid); err != nil {
-			return 1, status.WrapError(err, "streaming logs")
-		}
-	} else {
-		if err := printLogs(ctx, bbClient, iid); err != nil {
-			return 1, status.WrapError(err, "streaming logs")
-		}
+		log.Warnf("Remote run failed due to transient error. Retrying: %s", err)
+		retryCount++
 	}
-	isInvocationRunning = false
 
-	eg := errgroup.Group{}
-	var inRsp *inpb.GetInvocationResponse
-	var executeResponse *repb.ExecuteResponse
-	eg.Go(func() error {
-		var err error
-		inRsp, err = bbClient.GetInvocation(ctx, &inpb.GetInvocationRequest{Lookup: &inpb.InvocationLookup{InvocationId: iid}})
-		if err != nil {
-			return fmt.Errorf("could not retrieve invocation: %s", err)
-		}
-		if len(inRsp.GetInvocation()) == 0 {
-			return fmt.Errorf("invocation not found")
-		}
-		return nil
-	})
-	eg.Go(func() error {
-		execution, err := bbClient.GetExecution(ctx, &espb.GetExecutionRequest{ExecutionLookup: &espb.ExecutionLookup{
-			InvocationId: iid,
-		}})
-		if err != nil {
-			return fmt.Errorf("could not retrieve ci_runner execution: %s", err)
-		}
-		if len(execution.GetExecution()) == 0 {
-			return fmt.Errorf("ci_runner execution not found")
-		}
-		executionID := execution.GetExecution()[0].GetExecutionId()
-		waitExecutionStream, err := execClient.WaitExecution(ctx, &repb.WaitExecutionRequest{
-			Name: executionID,
-		})
-		if err != nil {
-			return fmt.Errorf("wait execution: %w", err)
-		}
-		rsp, err := rexec.Wait(rexec.NewRetryingStream(ctx, execClient, waitExecutionStream, executionID))
-		if err != nil {
-			return fmt.Errorf("wait execution: %v", err)
-		} else if rsp.Err != nil {
-			return fmt.Errorf("wait execution: %v", rsp.Err)
-		} else if rsp.ExecuteResponse.GetResult() == nil {
-			return fmt.Errorf("empty execute response from WaitExecution: %v", rsp.ExecuteResponse.GetStatus())
-		}
-		executeResponse = rsp.ExecuteResponse
-		return nil
-	})
-
-	err = eg.Wait()
-	if err != nil {
-		return 1, err
+	if latestErr != nil {
+		return 1, latestErr
 	}
 
 	childIID := ""
@@ -1063,6 +1010,94 @@ func Run(ctx context.Context, opts RunOpts, repoConfig *RepoConfig) (int, error)
 	return exitCode, nil
 }
 
+func attemptRun(ctx context.Context, bbClient bbspb.BuildBuddyServiceClient, execClient repb.ExecutionClient, req *rnpb.RunRequest) (*inpb.GetInvocationResponse, *repb.ExecuteResponse, error) {
+	var inRsp *inpb.GetInvocationResponse
+	var execRsp *repb.ExecuteResponse
+
+	rsp, err := bbClient.Run(ctx, req)
+	if err != nil {
+		return nil, nil, status.UnknownErrorf("error running bazel: %s", err)
+	}
+	iid := rsp.GetInvocationId()
+
+	// If the remote bazel process is canceled or killed, cancel the remote run
+	isInvocationRunning := true
+	defer func() {
+		if !isInvocationRunning {
+			return
+		}
+
+		// Use a non-cancelled context to ensure the remote executions are
+		// canceled
+		_, err = bbClient.CancelExecutions(context.WithoutCancel(ctx), &inpb.CancelExecutionsRequest{
+			InvocationId: iid,
+		})
+		if err != nil {
+			log.Warnf("Failed to cancel remote run: %s", err)
+		}
+	}()
+
+	interactive := terminal.IsTTY(os.Stdin) && terminal.IsTTY(os.Stderr)
+	if interactive {
+		if err := streamLogs(ctx, bbClient, iid); err != nil {
+			return nil, nil, status.WrapError(err, "streaming logs")
+		}
+	} else {
+		if err := printLogs(ctx, bbClient, iid); err != nil {
+			return nil, nil, status.WrapError(err, "streaming logs")
+		}
+	}
+	isInvocationRunning = false
+
+	eg := errgroup.Group{}
+	eg.Go(func() error {
+		var err error
+		inRsp, err = bbClient.GetInvocation(ctx, &inpb.GetInvocationRequest{Lookup: &inpb.InvocationLookup{InvocationId: iid}})
+		if err != nil {
+			return fmt.Errorf("could not retrieve invocation: %s", err)
+		}
+		if len(inRsp.GetInvocation()) == 0 {
+			return fmt.Errorf("invocation not found")
+		}
+		return nil
+	})
+	eg.Go(func() error {
+		execution, err := bbClient.GetExecution(ctx, &espb.GetExecutionRequest{ExecutionLookup: &espb.ExecutionLookup{
+			InvocationId: iid,
+		}})
+		if err != nil {
+			return fmt.Errorf("could not retrieve ci_runner execution: %s", err)
+		}
+		if len(execution.GetExecution()) == 0 {
+			return fmt.Errorf("ci_runner execution not found")
+		}
+		executionID := execution.GetExecution()[0].GetExecutionId()
+		waitExecutionStream, err := execClient.WaitExecution(ctx, &repb.WaitExecutionRequest{
+			Name: executionID,
+		})
+		if err != nil {
+			return fmt.Errorf("wait execution: %w", err)
+		}
+		rsp, err := rexec.Wait(rexec.NewRetryingStream(ctx, execClient, waitExecutionStream, executionID))
+		if err != nil {
+			return fmt.Errorf("wait execution: %v", err)
+		} else if rsp.Err != nil {
+			return fmt.Errorf("wait execution: %v", rsp.Err)
+		} else if rsp.ExecuteResponse.GetResult() == nil {
+			return fmt.Errorf("empty execute response from WaitExecution: %v", rsp.ExecuteResponse.GetStatus())
+		}
+		execRsp = rsp.ExecuteResponse
+		return nil
+	})
+
+	err = eg.Wait()
+	if err != nil {
+		return nil, nil, err
+	}
+
+	return inRsp, execRsp, nil
+}
+
 func HandleRemoteBazel(commandLineArgs []string) (int, error) {
 	commandLineArgs, err := parseRemoteCliFlags(commandLineArgs)
 	if err != nil {

diff --git a/docs/remote-bazel-introduction.md b/docs/remote-bazel-introduction.md
@@ -217,6 +217,9 @@ The following configuration options are supported:
   will mirror the local state (including any non-committed local diffs).
 - `--script`: If set, the bash code to run on the remote runner instead of a Bazel command.
   - See `Running bash scripts below` for more details.
+- `--disable_retry`: By default, remote runs are automatically retried on transient
+  errors. If your remote command is not idempotent (such as if you're running
+  a deploy command), you should set this to true to disable retries.
 
 In order to run the CLI with debug logs enabled, you can add `--verbose=1` between
 `bb` and `remote`. Note that this is a different syntax from the rest of the
@@ -270,6 +273,12 @@ curl -d '{
 https://app.buildbuddy.io/api/v1/Run
 ```
 
+### Retry behavior
+
+By default, Remote Bazel runs are assumed to be idempotent and are automatically
+retried on transient errors. If this is not the case and it is important that your
+commands run at most once, you should disable automatic retries with `--disable_retry`.
+
 ### Private GitHub repos
 
 If your GitHub repo is private, you must first link it at https://app.buildbuddy.io/workflows/