Skip to content

Commit

Permalink
Propagate status codes from shard failures appropriately (elastic#118016
Browse files Browse the repository at this point in the history
) (elastic#119205)

* Return 502 if the underlying error is `NodeNotConnectedException`

* Traverse through the cause stack trace and check for `NodeNotConnectedException` and undo `status()` override in `NodeDisconnectedException`

* Rewrite `while` condition

* Fix: precommit

* Let status codes propagate rather than walking the stacktrace explicitly

Walking the stacktrace explicitly and looking for a specific error (node
connection-related errors in this case) is a workaround rather than a
proper fix. Instead, let the status codes propagate all the way to the
top so that they can be reported as-is.

* Fix: unused import

* Fix null deref

* Do not map descendants of `ConnectTransportException` to `502`

* Fix: precommit

* Do not account for 4xx status codes

4xx status codes are not likely to appear along with 5xx status codes.
As a result, we do not need to account for them when looking at shard
failures' status codes.

* Remove unnecessary `switch` case

In reference to the previous commit: this case is no longer needed.

* Rewrite code comment

* Address review comments

* [CI] Auto commit changes from spotless

* Update docs/changelog/118016.yaml

---------

Co-authored-by: elasticsearchmachine <infra-root+elasticsearchmachine@elastic.co>
  • Loading branch information
1 parent 013f363 commit 5e9f907
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 7 deletions.
6 changes: 6 additions & 0 deletions docs/changelog/118016.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 118016
summary: Propagate status codes from shard failures appropriately
area: Search
type: enhancement
issues:
- 118482
Original file line number Diff line number Diff line change
Expand Up @@ -73,15 +73,21 @@ public RestStatus status() {
// on coordinator node. so get the status from cause instead of returning SERVICE_UNAVAILABLE blindly
return getCause() == null ? RestStatus.SERVICE_UNAVAILABLE : ExceptionsHelper.status(getCause());
}
RestStatus status = shardFailures[0].status();
if (shardFailures.length > 1) {
for (int i = 1; i < shardFailures.length; i++) {
if (shardFailures[i].status().getStatus() >= RestStatus.INTERNAL_SERVER_ERROR.getStatus()) {
status = shardFailures[i].status();
}
RestStatus status = null;
for (ShardSearchFailure shardFailure : shardFailures) {
RestStatus shardStatus = shardFailure.status();
int statusCode = shardStatus.getStatus();

// Return if it's an error that can be retried.
// These currently take precedence over other status code(s).
if (statusCode >= 502 && statusCode <= 504) {
return shardStatus;
} else if (statusCode >= 500) {
status = shardStatus;
}
}
return status;

return status == null ? shardFailures[0].status() : status;
}

public ShardSearchFailure[] shardFailures() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.search.SearchShardTarget;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.transport.NodeDisconnectedException;
import org.elasticsearch.xcontent.ToXContent;
import org.elasticsearch.xcontent.XContent;
import org.elasticsearch.xcontent.XContentParser;
Expand All @@ -31,6 +32,7 @@

import static org.hamcrest.CoreMatchers.hasItem;
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.is;

public class SearchPhaseExecutionExceptionTests extends ESTestCase {

Expand Down Expand Up @@ -168,4 +170,57 @@ public void testPhaseFailureWithSearchShardFailure() {

assertEquals(actual.status(), RestStatus.BAD_REQUEST);
}

public void testOnlyWithCodesThatDoNotRequirePrecedence() {
int pickedIndex = randomIntBetween(0, 1);

// Pick one of these exceptions randomly.
var searchExceptions = new ElasticsearchException[] {
new ElasticsearchException("simulated"),
new NodeDisconnectedException(null, "unused message", "unused action", null) };

// Status codes that map to searchExceptions.
var expectedStatusCodes = new RestStatus[] { RestStatus.INTERNAL_SERVER_ERROR, RestStatus.BAD_GATEWAY };

ShardSearchFailure shardFailure1 = new ShardSearchFailure(
searchExceptions[pickedIndex],
new SearchShardTarget("nodeID", new ShardId("someIndex", "someUUID", 1), null)
);

ShardSearchFailure shardFailure2 = new ShardSearchFailure(
searchExceptions[pickedIndex],
new SearchShardTarget("nodeID", new ShardId("someIndex", "someUUID", 2), null)
);

SearchPhaseExecutionException ex = new SearchPhaseExecutionException(
"search",
"all shards failed",
new ShardSearchFailure[] { shardFailure1, shardFailure2 }
);

assertThat(ex.status(), is(expectedStatusCodes[pickedIndex]));
}

public void testWithRetriableCodesThatTakePrecedence() {
// Maps to a 500.
ShardSearchFailure shardFailure1 = new ShardSearchFailure(
new ElasticsearchException("simulated"),
new SearchShardTarget("nodeID", new ShardId("someIndex", "someUUID", 1), null)
);

// Maps to a 502.
ShardSearchFailure shardFailure2 = new ShardSearchFailure(
new NodeDisconnectedException(null, "unused message", "unused action", null),
new SearchShardTarget("nodeID", new ShardId("someIndex", "someUUID", 2), null)
);

SearchPhaseExecutionException ex = new SearchPhaseExecutionException(
"search",
"all shards failed",
new ShardSearchFailure[] { shardFailure1, shardFailure2 }
);

// The 502 takes precedence over 500.
assertThat(ex.status(), is(RestStatus.BAD_GATEWAY));
}
}

0 comments on commit 5e9f907

Please sign in to comment.