-
Notifications
You must be signed in to change notification settings - Fork 25k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Disallow partial results when shard unavailable #45739
Changes from 8 commits
44012c1
f4d63bb
27b5de7
f4fe7e2
6b16c70
a790db8
935f0df
5e35690
c2df1ab
a812b46
ea31de9
0419183
2767e18
9c97b8f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,12 +28,17 @@ | |
import org.elasticsearch.cluster.routing.ShardRouting; | ||
import org.elasticsearch.cluster.routing.ShardRoutingState; | ||
import org.elasticsearch.common.settings.Settings; | ||
import org.elasticsearch.common.unit.TimeValue; | ||
import org.elasticsearch.index.query.RangeQueryBuilder; | ||
import org.elasticsearch.rest.RestStatus; | ||
import org.elasticsearch.search.SearchService; | ||
import org.elasticsearch.test.ESIntegTestCase; | ||
import org.junit.After; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.concurrent.atomic.AtomicBoolean; | ||
import java.util.function.Supplier; | ||
|
||
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; | ||
import static org.hamcrest.Matchers.containsString; | ||
|
@@ -44,7 +49,6 @@ | |
@ESIntegTestCase.ClusterScope(minNumDataNodes = 2) | ||
public class SearchRedStateIndexIT extends ESIntegTestCase { | ||
|
||
|
||
public void testAllowPartialsWithRedState() throws Exception { | ||
final int numShards = cluster().numDataNodes()+2; | ||
buildRedIndex(numShards); | ||
|
@@ -95,6 +99,89 @@ public void testClusterDisallowPartialsWithRedState() throws Exception { | |
assertThat(ex.getDetailedMessage(), containsString("Search rejected due to missing shard")); | ||
} | ||
|
||
public void testDisallowPartialsWithRedStateRecovering() throws Exception { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder if it would be simpler to add a unit test in |
||
int docCount = scaledRandomIntBetween(1000, 10000); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we need this number of docs for the test ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The test has a tendency to fail more often on larger sizes. I ended up with 10, 1000, since this makes it fail around 1/3 of the runs (but this is likely highly dependent on the hardware). Changed in: c2df1ab |
||
logger.info("Using docCount [{}]", docCount); | ||
buildIndex(cluster().numDataNodes(), 1, docCount); | ||
|
||
AtomicBoolean stop = new AtomicBoolean(); | ||
List<Thread> searchThreads = new ArrayList<>(); | ||
// this is a little extreme, but necessary to make this test fail reasonably often (half the runs on my machine). | ||
for (int i = 0; i < 100; ++i) { | ||
Thread searchThread = new Thread() { | ||
{ | ||
setDaemon(true); | ||
} | ||
|
||
@Override | ||
public void run() { | ||
while (stop.get() == false) { | ||
// todo: the timeouts below should not be necessary, but this test sometimes hangs without them and that is not | ||
// the immediate purpose of the test. | ||
verify(() -> client().prepareSearch("test").setQuery(new RangeQueryBuilder("field1").gte(0)) | ||
.setSize(100).setAllowPartialSearchResults(false).get(TimeValue.timeValueSeconds(10))); | ||
verify(() -> client().prepareSearch("test") | ||
.setSize(100).setAllowPartialSearchResults(false).get(TimeValue.timeValueSeconds(10))); | ||
} | ||
} | ||
|
||
void verify(Supplier<SearchResponse> call) { | ||
try { | ||
SearchResponse response = call.get(); | ||
assertThat(response.getHits().getHits().length, equalTo(100)); | ||
assertThat(response.getHits().getTotalHits().value, equalTo((long) docCount)); | ||
} catch (Exception e) { | ||
// this is OK. | ||
logger.info("Failed with : " + e); | ||
} | ||
} | ||
}; | ||
searchThreads.add(searchThread); | ||
searchThread.start(); | ||
} | ||
try { | ||
Thread restartThread = new Thread() { | ||
{ | ||
setDaemon(true); | ||
} | ||
|
||
@Override | ||
public void run() { | ||
try { | ||
for (int i = 0; i < 5; ++i) { | ||
internalCluster().restartRandomDataNode(); | ||
} | ||
} catch (Exception e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
}; | ||
restartThread.start(); | ||
for (int i = 0; i < 5; ++i) { | ||
internalCluster().restartRandomDataNode(); | ||
} | ||
restartThread.join(30000); | ||
assertFalse(restartThread.isAlive()); | ||
} finally { | ||
stop.set(true); | ||
searchThreads.forEach(thread -> { | ||
try { | ||
thread.join(30000); | ||
if (thread.isAlive()) { | ||
logger.warn("Thread: " + thread + " is still alive"); | ||
// do not continue unless thread terminates to avoid getting other confusing test errors. Please kill me... | ||
thread.join(); | ||
} | ||
} catch (InterruptedException e) { | ||
throw new RuntimeException(e); | ||
} | ||
}); | ||
} | ||
|
||
// hack to ensure all search contexts are removed, seems we risk leaked search contexts when coordinator dies. | ||
client().admin().indices().prepareDelete("test").get(); | ||
} | ||
|
||
private void setClusterDefaultAllowPartialResults(boolean allowPartialResults) { | ||
String key = SearchService.DEFAULT_ALLOW_PARTIAL_SEARCH_RESULTS.getKey(); | ||
|
||
|
@@ -110,28 +197,35 @@ private void setClusterDefaultAllowPartialResults(boolean allowPartialResults) { | |
} | ||
|
||
private void buildRedIndex(int numShards) throws Exception { | ||
assertAcked(prepareCreate("test").setSettings(Settings.builder().put("index.number_of_shards", | ||
numShards).put("index.number_of_replicas", 0))); | ||
buildIndex(numShards, 0, 10); | ||
|
||
stopNodeAndEnsureRed(); | ||
} | ||
|
||
private void buildIndex(int numShards, int numReplicas, int docCount) { | ||
assertAcked(prepareCreate("test").setSettings(Settings.builder() | ||
.put("index.number_of_shards", numShards).put("index.number_of_replicas", numReplicas))); | ||
ensureGreen(); | ||
for (int i = 0; i < 10; i++) { | ||
client().prepareIndex("test", "type1", ""+i).setSource("field1", "value1").get(); | ||
for (int i = 0; i < docCount; i++) { | ||
client().prepareIndex("test", "type1", ""+i).setSource("field1", i).get(); | ||
} | ||
refresh(); | ||
|
||
internalCluster().stopRandomDataNode(); | ||
|
||
} | ||
|
||
private void stopNodeAndEnsureRed() throws Exception { | ||
internalCluster().stopRandomDataNode(); | ||
|
||
client().admin().cluster().prepareHealth().setWaitForStatus(ClusterHealthStatus.RED).get(); | ||
|
||
assertBusy(() -> { | ||
ClusterState clusterState = client().admin().cluster().prepareState().get().getState(); | ||
List<ShardRouting> unassigneds = clusterState.getRoutingTable().shardsWithState(ShardRoutingState.UNASSIGNED); | ||
assertThat(unassigneds.size(), greaterThan(0)); | ||
}); | ||
|
||
}); | ||
} | ||
|
||
@After | ||
public void cleanup() throws Exception { | ||
public void cleanup() { | ||
assertAcked(client().admin().cluster().prepareUpdateSettings() | ||
.setTransientSettings(Settings.builder().putNull(SearchService.DEFAULT_ALLOW_PARTIAL_SEARCH_RESULTS.getKey()))); | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We shouldn't have two tests to check if a partial failure occurred. This one could replace the block above and if successfulOps is less than the number of shards and shardSearchFailures.length is 0 we can assume that the failure was a shard unavailable exception ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed in c2df1ab