Skip to content

Commit

Permalink
Automatically retry the build if a remote cache eviction error is encountered
Browse files Browse the repository at this point in the history
  • Loading branch information
coeuvre committed Mar 17, 2023
1 parent b217dd7 commit 470424c
Show file tree
Hide file tree
Showing 6 changed files with 125 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -261,16 +261,14 @@ public ListenableFuture<Void> prefetchInputs()
(BulkTransferException e) -> {
if (BulkTransferException.allCausedByCacheNotFoundException(e)) {
var code =
executionOptions.useNewExitCodeForLostInputs
(executionOptions.useNewExitCodeForLostInputs
|| executionOptions.remoteRetryOnCacheEviction > 0)
? Code.REMOTE_CACHE_EVICTED
: Code.REMOTE_CACHE_FAILED;
throw new EnvironmentalExecException(
e,
FailureDetail.newBuilder()
.setMessage(
"Failed to fetch blobs because they do not exist remotely."
+ " Build without the Bytes does not work if your remote"
+ " cache evicts blobs during builds.")
.setMessage("Failed to fetch blobs because they do not exist remotely.")
.setSpawn(FailureDetails.Spawn.newBuilder().setCode(code))
.build());
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -507,6 +507,17 @@ public boolean usingLocalTestJobs() {
+ " blobs during the build.")
public boolean useNewExitCodeForLostInputs;

/**
 * Value of {@code --experimental_remote_cache_eviction_retries}: the maximum number of times the
 * build is retried after a remote cache eviction error. Defaults to {@code 0}.
 *
 * <p>Per the flag's help text, a non-zero value implicitly enables
 * {@code --incompatible_remote_use_new_exit_code_for_lost_inputs}.
 */
@Option(
name = "experimental_remote_cache_eviction_retries",
defaultValue = "0",
documentationCategory = OptionDocumentationCategory.REMOTE,
effectTags = {OptionEffectTag.EXECUTION},
help =
"The maximum number of attempts to retry if the build encountered remote cache eviction"
+ " error. A non-zero value will implicitly set"
+ " --incompatible_remote_use_new_exit_code_for_lost_inputs to true.")
public int remoteRetryOnCacheEviction;

/** An enum for specifying different formats of test output. */
public enum TestOutputFormat {
SUMMARY, // Provide summary output only.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -556,7 +556,8 @@ private SpawnResult handleError(
catastrophe = true;
} else if (remoteCacheFailed) {
status = Status.REMOTE_CACHE_FAILED;
if (executionOptions.useNewExitCodeForLostInputs) {
if (executionOptions.useNewExitCodeForLostInputs
|| executionOptions.remoteRetryOnCacheEviction > 0) {
detailedCode = FailureDetails.Spawn.Code.REMOTE_CACHE_EVICTED;
} else {
detailedCode = FailureDetails.Spawn.Code.REMOTE_CACHE_FAILED;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
import com.google.devtools.build.lib.util.AnsiStrippingOutputStream;
import com.google.devtools.build.lib.util.DebugLoggerConfigurator;
import com.google.devtools.build.lib.util.DetailedExitCode;
import com.google.devtools.build.lib.util.ExitCode;
import com.google.devtools.build.lib.util.InterruptedFailureDetails;
import com.google.devtools.build.lib.util.LoggingUtil;
import com.google.devtools.build.lib.util.Pair;
Expand Down Expand Up @@ -148,6 +149,55 @@ public BlazeCommandResult exec(
Optional<List<Pair<String, String>>> startupOptionsTaggedWithBazelRc,
List<Any> commandExtensions)
throws InterruptedException {
var remoteCacheEvictionRetries = getRemoteCacheEvictionRetries(args, outErr);
while (true) {
var result =
execOnce(
invocationPolicy,
args,
outErr,
lockingMode,
clientDescription,
firstContactTimeMillis,
startupOptionsTaggedWithBazelRc,
commandExtensions);
if (result.getExitCode() == ExitCode.REMOTE_CACHE_EVICTED && remoteCacheEvictionRetries > 0) {
--remoteCacheEvictionRetries;
outErr.printErrLn("Found remote cache eviction error, retrying the build...");
continue;
}
return result;
}
}

/**
 * Extracts the value of {@code --experimental_remote_cache_eviction_retries} from the raw,
 * not-yet-parsed command-line arguments.
 *
 * <p>Both the {@code --flag=value} and the {@code --flag value} spellings are recognized. When
 * the flag is given several times the last occurrence wins, and scanning stops at the {@code --}
 * terminator so that arguments following it are never interpreted as this flag.
 *
 * @param args the raw command-line arguments of the invocation
 * @param outErr where a diagnostic is printed if the flag value is not a valid integer
 * @return the requested number of retries, or {@code 0} if the flag is absent or malformed
 */
private int getRemoteCacheEvictionRetries(List<String> args, OutErr outErr) {
  // Since flags are not parsed yet at this point, we manually extract value of the retry flag.
  var retryFlag = "--experimental_remote_cache_eviction_retries";
  var retryFlagPrefix = retryFlag + "=";
  var retries = 0;
  for (int i = 0; i < args.size(); i++) {
    var arg = args.get(i);
    if (arg.equals("--")) {
      // Everything after the terminator is not an option of this invocation.
      break;
    }

    String rawValue;
    if (arg.startsWith(retryFlagPrefix)) {
      rawValue = arg.substring(retryFlagPrefix.length());
    } else if (arg.equals(retryFlag) && i + 1 < args.size()) {
      // Space-separated form: the value is the next argument, so consume it as well.
      rawValue = args.get(++i);
    } else {
      continue;
    }

    try {
      // Do not return yet: a later occurrence of the flag overrides an earlier one.
      retries = Integer.parseInt(rawValue);
    } catch (NumberFormatException e) {
      outErr.printErrLn(
          String.format(
              "Failed to parse retry times: %s, remote cache eviction retry is disabled", e));
      return 0;
    }
  }
  return retries;
}

public BlazeCommandResult execOnce(
InvocationPolicy invocationPolicy,
List<String> args,
OutErr outErr,
LockingMode lockingMode,
String clientDescription,
long firstContactTimeMillis,
Optional<List<Pair<String, String>>> startupOptionsTaggedWithBazelRc,
List<Any> commandExtensions)
throws InterruptedException {
OriginalUnstructuredCommandLineEvent originalCommandLine =
new OriginalUnstructuredCommandLineEvent(args);
Preconditions.checkNotNull(clientDescription);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -474,9 +474,7 @@ public void remoteCacheEvictBlobs_whenPrefetchingInput_exitWithCode39() throws E
// Assert: Exit code is 39
assertThat(error)
.hasMessageThat()
.contains(
"Build without the Bytes does not work if your remote cache evicts blobs"
+ " during builds");
.contains("Failed to fetch blobs because they do not exist remotely");
assertThat(error).hasMessageThat().contains(String.format("%s/%s", hashCode, bytes.length));
assertThat(error.getDetailedExitCode().getExitCode().getNumericExitCode()).isEqualTo(39);
}
Expand Down
58 changes: 58 additions & 0 deletions src/test/shell/bazel/remote/build_without_the_bytes_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1685,4 +1685,62 @@ end_of_record"
expect_log "$expected_result"
}

function test_remote_cache_eviction_when_prefetching_input() {
  # Scenario: //a:bar runs locally (no-remote-exec) and consumes foo.out, which
  # is produced remotely and never downloaded. After the remote cache loses its
  # blobs, an incremental build must recover by retrying.
  mkdir -p a

  cat > a/BUILD <<'EOF'
genrule(
name = 'foo',
srcs = ['foo.in'],
outs = ['foo.out'],
cmd = 'cat $(SRCS) > $@',
)
genrule(
name = 'bar',
srcs = ['foo.out', 'bar.in'],
outs = ['bar.out'],
cmd = 'cat $(SRCS) > $@',
tags = ['no-remote-exec'],
)
EOF

  echo foo > a/foo.in
  echo bar > a/bar.in

  # Step 1: warm up the remote cache.
  # Note: ${worker_port} is expanded at each invocation on purpose, since the
  # worker is restarted further down.
  bazel build \
    --remote_executor=grpc://localhost:${worker_port} \
    --remote_download_minimal \
    //a:bar > "$TEST_log" 2>&1 || fail "Failed to build"

  bazel clean

  # Step 2: rebuild from scratch; everything is a cache hit, so the
  # intermediate foo.out stays remote-only.
  bazel build \
    --remote_executor=grpc://localhost:${worker_port} \
    --remote_download_minimal \
    //a:bar > "$TEST_log" 2>&1 || fail "Failed to build"

  if [[ -f bazel-bin/a/foo.out ]]; then
    fail "Expected intermediate output bazel-bin/a/foo.out to not be downloaded"
  fi

  # Step 3: restarting the worker drops every blob from the remote cache.
  stop_worker
  start_worker

  echo "updated bar" > a/bar.in

  # Step 4: the incremental build needs foo.out as a local input, hits the
  # eviction error, and is expected to retry and regenerate the missing blobs.
  bazel build \
    --remote_executor=grpc://localhost:${worker_port} \
    --remote_download_minimal \
    --experimental_remote_cache_eviction_retries=5 \
    //a:bar > "$TEST_log" 2>&1 || fail "Failed to build"

  expect_log "Found remote cache eviction error, retrying the build..."
}

run_suite "Build without the Bytes tests"

0 comments on commit 470424c

Please sign in to comment.