Skip to content

Commit

Permalink
[RB] Handle retries in the CLI rather than server-side (#8195)
Browse files Browse the repository at this point in the history
Extracted CLI code from
#8172

This PR disables automatic retries when using the remote bazel CLI. The
CLI then implements retry behavior itself so it can detect when a retry
is happening

This fixes a bug where if the remote run fails due to a transient error,
the CLI will report a failure. However the scheduler may retry the run
in the background and it may eventually succeed. When users go to the
invocation link, they'll see that it succeeded despite expecting a
failure

Slack thread (Issue 2):
https://buildbuddy-corp.slack.com/archives/C07GMM2VBLY/p1734595804049229?thread_ts=1734595606.837269&cid=C07GMM2VBLY

Should be merged after
#8172 is rolled out
  • Loading branch information
maggie-lou authored Feb 6, 2025
1 parent d9e5f87 commit 923eb4d
Show file tree
Hide file tree
Showing 2 changed files with 121 additions and 77 deletions.
189 changes: 112 additions & 77 deletions cli/remotebazel/remotebazel.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ const (
// `git remote` output is expected to look like:
// `origin git@github.com:buildbuddy-io/buildbuddy.git (fetch)`
gitRemoteRegex = `(.+)\s+(.+)\s+\((push|fetch)\)`

maxRetries = 5
)

var (
Expand All @@ -89,6 +91,7 @@ var (
// Ex. --run_from_snapshot='{"snapshotId":"XXX","instanceName":""}'
runFromSnapshot = RemoteFlagset.String("run_from_snapshot", "", "JSON for a snapshot key that the remote runner should be resumed from. If unset, the snapshot key is determined programatically.")
script = RemoteFlagset.String("script", "", "Shell code to run remotely instead of a Bazel command.")
disableRetry = RemoteFlagset.Bool("disable_retry", false, "By default, transient errors are automatically retried. This behavior can be disabled, if a command is non-idempotent for example.")
)

func consoleCursorMoveUp(y int) {
Expand Down Expand Up @@ -885,6 +888,9 @@ func Run(ctx context.Context, opts RunOpts, repoConfig *RepoConfig) (int, error)
ExecProperties: platform.Properties,
RemoteHeaders: *remoteHeaders,
RunRemotely: *runRemotely,
// In order to detect and notify on retry, this client will implement
// retry behavior itself. Direct the server to not retry.
DisableRetry: true,
Steps: []*rnpb.Step{
{
Run: opts.Command,
Expand All @@ -904,93 +910,34 @@ func Run(ctx context.Context, opts RunOpts, repoConfig *RepoConfig) (int, error)
if len(encodedReq) > 0 {
log.Debugf("Run request: %s", string(encodedReq))
}

log.Printf("\nWaiting for available remote runner...\n")
rsp, err := bbClient.Run(ctx, req)
if err != nil {
return 1, status.UnknownErrorf("error running bazel: %s", err)
}

iid := rsp.GetInvocationId()
log.Debugf("Invocation ID: %s", iid)
retry := !*disableRetry
retryCount := 0

// If the remote bazel process is canceled or killed, cancel the remote run
isInvocationRunning := true
go func() {
<-ctx.Done()
var inRsp *inpb.GetInvocationResponse
var executeResponse *repb.ExecuteResponse
var latestErr error
for {
inRsp, executeResponse, latestErr = attemptRun(ctx, bbClient, execClient, req)

if !isInvocationRunning {
return
if len(inRsp.GetInvocation()) > 0 && inRsp.GetInvocation()[0].Success ||
!rexec.Retryable(err) ||
status.IsDeadlineExceededError(err) ||
ctx.Err() != nil {
retry = false
}

// Use a non-cancelled context to ensure the remote executions are
// canceled
_, err = bbClient.CancelExecutions(context.WithoutCancel(ctx), &inpb.CancelExecutionsRequest{
InvocationId: iid,
})
if err != nil {
log.Warnf("Failed to cancel remote run: %s", err)
if !retry || retryCount >= maxRetries {
break
}
}()

interactive := terminal.IsTTY(os.Stdin) && terminal.IsTTY(os.Stderr)
if interactive {
if err := streamLogs(ctx, bbClient, iid); err != nil {
return 1, status.WrapError(err, "streaming logs")
}
} else {
if err := printLogs(ctx, bbClient, iid); err != nil {
return 1, status.WrapError(err, "streaming logs")
}
log.Warnf("Remote run failed due to transient error. Retrying: %s", err)
retryCount++
}
isInvocationRunning = false

eg := errgroup.Group{}
var inRsp *inpb.GetInvocationResponse
var executeResponse *repb.ExecuteResponse
eg.Go(func() error {
var err error
inRsp, err = bbClient.GetInvocation(ctx, &inpb.GetInvocationRequest{Lookup: &inpb.InvocationLookup{InvocationId: iid}})
if err != nil {
return fmt.Errorf("could not retrieve invocation: %s", err)
}
if len(inRsp.GetInvocation()) == 0 {
return fmt.Errorf("invocation not found")
}
return nil
})
eg.Go(func() error {
execution, err := bbClient.GetExecution(ctx, &espb.GetExecutionRequest{ExecutionLookup: &espb.ExecutionLookup{
InvocationId: iid,
}})
if err != nil {
return fmt.Errorf("could not retrieve ci_runner execution: %s", err)
}
if len(execution.GetExecution()) == 0 {
return fmt.Errorf("ci_runner execution not found")
}
executionID := execution.GetExecution()[0].GetExecutionId()
waitExecutionStream, err := execClient.WaitExecution(ctx, &repb.WaitExecutionRequest{
Name: executionID,
})
if err != nil {
return fmt.Errorf("wait execution: %w", err)
}
rsp, err := rexec.Wait(rexec.NewRetryingStream(ctx, execClient, waitExecutionStream, executionID))
if err != nil {
return fmt.Errorf("wait execution: %v", err)
} else if rsp.Err != nil {
return fmt.Errorf("wait execution: %v", rsp.Err)
} else if rsp.ExecuteResponse.GetResult() == nil {
return fmt.Errorf("empty execute response from WaitExecution: %v", rsp.ExecuteResponse.GetStatus())
}
executeResponse = rsp.ExecuteResponse
return nil
})

err = eg.Wait()
if err != nil {
return 1, err
if latestErr != nil {
return 1, latestErr
}

childIID := ""
Expand Down Expand Up @@ -1063,6 +1010,94 @@ func Run(ctx context.Context, opts RunOpts, repoConfig *RepoConfig) (int, error)
return exitCode, nil
}

func attemptRun(ctx context.Context, bbClient bbspb.BuildBuddyServiceClient, execClient repb.ExecutionClient, req *rnpb.RunRequest) (*inpb.GetInvocationResponse, *repb.ExecuteResponse, error) {
var inRsp *inpb.GetInvocationResponse
var execRsp *repb.ExecuteResponse

rsp, err := bbClient.Run(ctx, req)
if err != nil {
return nil, nil, status.UnknownErrorf("error running bazel: %s", err)
}
iid := rsp.GetInvocationId()

// If the remote bazel process is canceled or killed, cancel the remote run
isInvocationRunning := true
defer func() {
if !isInvocationRunning {
return
}

// Use a non-cancelled context to ensure the remote executions are
// canceled
_, err = bbClient.CancelExecutions(context.WithoutCancel(ctx), &inpb.CancelExecutionsRequest{
InvocationId: iid,
})
if err != nil {
log.Warnf("Failed to cancel remote run: %s", err)
}
}()

interactive := terminal.IsTTY(os.Stdin) && terminal.IsTTY(os.Stderr)
if interactive {
if err := streamLogs(ctx, bbClient, iid); err != nil {
return nil, nil, status.WrapError(err, "streaming logs")
}
} else {
if err := printLogs(ctx, bbClient, iid); err != nil {
return nil, nil, status.WrapError(err, "streaming logs")
}
}
isInvocationRunning = false

eg := errgroup.Group{}
eg.Go(func() error {
var err error
inRsp, err = bbClient.GetInvocation(ctx, &inpb.GetInvocationRequest{Lookup: &inpb.InvocationLookup{InvocationId: iid}})
if err != nil {
return fmt.Errorf("could not retrieve invocation: %s", err)
}
if len(inRsp.GetInvocation()) == 0 {
return fmt.Errorf("invocation not found")
}
return nil
})
eg.Go(func() error {
execution, err := bbClient.GetExecution(ctx, &espb.GetExecutionRequest{ExecutionLookup: &espb.ExecutionLookup{
InvocationId: iid,
}})
if err != nil {
return fmt.Errorf("could not retrieve ci_runner execution: %s", err)
}
if len(execution.GetExecution()) == 0 {
return fmt.Errorf("ci_runner execution not found")
}
executionID := execution.GetExecution()[0].GetExecutionId()
waitExecutionStream, err := execClient.WaitExecution(ctx, &repb.WaitExecutionRequest{
Name: executionID,
})
if err != nil {
return fmt.Errorf("wait execution: %w", err)
}
rsp, err := rexec.Wait(rexec.NewRetryingStream(ctx, execClient, waitExecutionStream, executionID))
if err != nil {
return fmt.Errorf("wait execution: %v", err)
} else if rsp.Err != nil {
return fmt.Errorf("wait execution: %v", rsp.Err)
} else if rsp.ExecuteResponse.GetResult() == nil {
return fmt.Errorf("empty execute response from WaitExecution: %v", rsp.ExecuteResponse.GetStatus())
}
execRsp = rsp.ExecuteResponse
return nil
})

err = eg.Wait()
if err != nil {
return nil, nil, err
}

return inRsp, execRsp, nil
}

func HandleRemoteBazel(commandLineArgs []string) (int, error) {
commandLineArgs, err := parseRemoteCliFlags(commandLineArgs)
if err != nil {
Expand Down
9 changes: 9 additions & 0 deletions docs/remote-bazel-introduction.md
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,9 @@ The following configuration options are supported:
will mirror the local state (including any non-committed local diffs).
- `--script`: If set, the bash code to run on the remote runner instead of a Bazel command.
- See `Running bash scripts below` for more details.
- `--disable_retry`: By default, remote runs are automatically retried on transient
errors. If your remote command is not idempotent (such as if you're running
a deploy command), you should set this to true to disable retries.

In order to run the CLI with debug logs enabled, you can add `--verbose=1` between
`bb` and `remote`. Note that this is a different syntax from the rest of the
Expand Down Expand Up @@ -270,6 +273,12 @@ curl -d '{
https://app.buildbuddy.io/api/v1/Run
```

### Retry behavior

By default, Remote Bazel runs are assumed to be idempotent and are automatically
retried on transient errors. If this is not the case and it is important that your
commands run at most once, you should disable automatic retries with `--disable_retry`.

### Private GitHub repos

If your GitHub repo is private, you must first link it at https://app.buildbuddy.io/workflows/
Expand Down

0 comments on commit 923eb4d

Please sign in to comment.