Skip to content

Commit

Permalink
Fix SnapshotShardStatus Reporting for Failed Shard (elastic#48556)
Browse files Browse the repository at this point in the history
Fixes the shard snapshot status reporting for failed shards
in the corner case of failing the shard because of an exception
thrown in `SnapshotShardsService` and not the repository.
We were missing the update on the `snapshotStatus` instance in
this case which made the transport APIs using this field report
back an incorrect status.
Fixed by moving the failure handling to the `SnapshotShardsService`
for all cases (which also simplifies the code, the ex. wrapping in
the repository was pointless as we only used the ex. trace upstream
anyway).
Also, added an assertion to another test that explicitly checks this
failure situation (ex. in the `SnapshotShardsService`) already.

Closes elastic#48526
  • Loading branch information
original-brownbear committed Oct 30, 2019
1 parent 4204b2c commit d730802
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.RateLimiter;
import org.apache.lucene.util.SetOnce;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.Version;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.ActionRunnable;
Expand Down Expand Up @@ -995,12 +994,6 @@ public void snapshotShard(Store store, MapperService mapperService, SnapshotId s
IndexCommit snapshotIndexCommit, IndexShardSnapshotStatus snapshotStatus, ActionListener<Void> listener) {
final ShardId shardId = store.shardId();
final long startTime = threadPool.absoluteTimeInMillis();
final StepListener<Void> snapshotDoneListener = new StepListener<>();
snapshotDoneListener.whenComplete(listener::onResponse, e -> {
snapshotStatus.moveToFailed(threadPool.absoluteTimeInMillis(), ExceptionsHelper.detailedMessage(e));
listener.onFailure(e instanceof IndexShardSnapshotFailedException ? (IndexShardSnapshotFailedException) e
: new IndexShardSnapshotFailedException(store.shardId(), e));
});
try {
logger.debug("[{}] [{}] snapshot to [{}] ...", shardId, snapshotId, metadata.name());

Expand Down Expand Up @@ -1137,8 +1130,8 @@ public void snapshotShard(Store store, MapperService mapperService, SnapshotId s
snapshotId, shardId), e);
}
snapshotStatus.moveToDone(threadPool.absoluteTimeInMillis());
snapshotDoneListener.onResponse(null);
}, snapshotDoneListener::onFailure);
listener.onResponse(null);
}, listener::onFailure);
if (indexIncrementalFileCount == 0) {
allFilesUploadedListener.onResponse(Collections.emptyList());
return;
Expand Down Expand Up @@ -1181,7 +1174,7 @@ public void onFailure(Exception e) {
});
}
} catch (Exception e) {
snapshotDoneListener.onFailure(e);
listener.onFailure(e);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -292,8 +292,10 @@ public void onResponse(final Void aVoid) {

@Override
public void onFailure(Exception e) {
final String failure = ExceptionsHelper.detailedMessage(e);
snapshotStatus.moveToFailed(threadPool.absoluteTimeInMillis(), failure);
logger.warn(() -> new ParameterizedMessage("[{}][{}] failed to snapshot shard", shardId, snapshot), e);
notifyFailedSnapshotShard(snapshot, shardId, ExceptionsHelper.detailedMessage(e));
notifyFailedSnapshotShard(snapshot, shardId, failure);
}
});
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1224,6 +1224,12 @@ public void testDataNodeRestartWithBusyMasterDuringSnapshot() throws Exception {
disruption.startDisrupting();
logger.info("--> restarting data node, which should cause primary shards to be failed");
internalCluster().restartNode(dataNode, InternalTestCluster.EMPTY_CALLBACK);

logger.info("--> wait for shard snapshots to show as failed");
assertBusy(() -> assertThat(
client().admin().cluster().prepareSnapshotStatus("test-repo").setSnapshots("test-snap").get().getSnapshots()
.get(0).getShardsStats().getFailedShards(), greaterThanOrEqualTo(1)), 60L, TimeUnit.SECONDS);

unblockNode("test-repo", dataNode);
disruption.stopDisrupting();
// check that snapshot completes
Expand Down

0 comments on commit d730802

Please sign in to comment.