Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix query analyzer logic for mixed conjunctions of terms and ranges #49803

Merged
merged 6 commits into from
Dec 10, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ private static Result handleConjunction(List<Result> conjunctionsWithUnknowns) {
List<Result> conjunctions = conjunctionsWithUnknowns.stream().filter(r -> r.isUnknown() == false).collect(Collectors.toList());
if (conjunctions.isEmpty()) {
if (conjunctionsWithUnknowns.isEmpty()) {
throw new IllegalArgumentException("Must have at least on conjunction sub result");
throw new IllegalArgumentException("Must have at least one conjunction sub result");
}
return conjunctionsWithUnknowns.get(0); // all conjunctions are unknown, so just return the first one
}
Expand All @@ -247,47 +247,53 @@ private static Result handleConjunction(List<Result> conjunctionsWithUnknowns) {
int msm = 0;
boolean verified = conjunctionsWithUnknowns.size() == conjunctions.size();
boolean matchAllDocs = true;
boolean hasDuplicateTerms = false;
Set<QueryExtraction> extractions = new HashSet<>();
Set<String> seenRangeFields = new HashSet<>();
for (Result result : conjunctions) {
// In case that there are duplicate query extractions we need to be careful with
// incrementing msm,
// because that could lead to valid matches not becoming candidate matches:
// query: (field:val1 AND field:val2) AND (field:val2 AND field:val3)
// doc: field: val1 val2 val3
// So lets be protective and decrease the msm:

int resultMsm = result.minimumShouldMatch;
for (QueryExtraction queryExtraction : result.extractions) {
if (queryExtraction.range != null) {
// In case of range queries each extraction does not simply increment the
// minimum_should_match
// for that percolator query like for a term based extraction, so that can lead
// to more false
// positives for percolator queries with range queries than term based queries.
// The is because the way number fields are extracted from the document to be
// percolated.
// Per field a single range is extracted and if a percolator query has two or
// more range queries
// on the same field, then the minimum should match can be higher than clauses
// in the CoveringQuery.
// Therefore right now the minimum should match is incremented once per number
// field when processing
// the percolator query at index time.
if (seenRangeFields.add(queryExtraction.range.fieldName)) {
resultMsm = 1;
} else {
resultMsm = 0;
// minimum_should_match for that percolator query like for a term based extraction,
// so that can lead to more false positives for percolator queries with range queries
// than term based queries.
// This is because the way number fields are extracted from the document to be
// percolated. Per field a single range is extracted and if a percolator query has two or
// more range queries on the same field, then the minimum should match can be higher than clauses
// in the CoveringQuery. Therefore right now the minimum should match is only incremented once per
// number field when processing the percolator query at index time.
// For multiple ranges within a single extraction (ie from an existing conjunction or disjunction)
// then this will already have been taken care of, so we only check against fieldnames from
// previously processed extractions, and don't add to the seenRangeFields list until all
// extractions from this result are processed
if (seenRangeFields.contains(queryExtraction.range.fieldName)) {
resultMsm = Math.max(0, resultMsm - 1);
verified = false;
}
}

if (extractions.contains(queryExtraction)) {
resultMsm = Math.max(0, resultMsm - 1);
verified = false;
else {
// In case that there are duplicate term query extractions we need to be careful with
// incrementing msm, because that could lead to valid matches not becoming candidate matches:
// query: (field:val1 AND field:val2) AND (field:val2 AND field:val3)
// doc: field: val1 val2 val3
// So lets be protective and decrease the msm:
if (extractions.contains(queryExtraction)) {
resultMsm = Math.max(0, resultMsm - 1);
verified = false;
}
}
}
msm += resultMsm;

// add range fields from this Result to the seenRangeFields set so that minimumShouldMatch is correctly
// calculated for subsequent Results
result.extractions.stream()
.map(e -> e.range)
.filter(Objects::nonNull)
.map(e -> e.fieldName)
.forEach(seenRangeFields::add);

if (result.verified == false
// If some inner extractions are optional, the result can't be verified
|| result.minimumShouldMatch < result.extractions.size()) {
Expand All @@ -299,7 +305,7 @@ private static Result handleConjunction(List<Result> conjunctionsWithUnknowns) {
if (matchAllDocs) {
return new Result(matchAllDocs, verified);
} else {
return new Result(verified, extractions, hasDuplicateTerms ? 1 : msm);
return new Result(verified, extractions, msm);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
import static org.elasticsearch.percolator.QueryAnalyzer.analyze;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.collection.IsCollectionWithSize.hasSize;

public class QueryAnalyzerTests extends ESTestCase {

Expand Down Expand Up @@ -1208,4 +1209,135 @@ public void testIntervalQueries() {
assertTermsEqual(result.extractions, new Term("field", "a"));
}

public void testCombinedRangeAndTermWithMinimumShouldMatch() {

Query disj = new BooleanQuery.Builder()
.add(IntPoint.newRangeQuery("i", 0, 10), Occur.SHOULD)
.add(new TermQuery(new Term("f", "v1")), Occur.SHOULD)
.add(new TermQuery(new Term("f", "v1")), Occur.SHOULD)
.setMinimumNumberShouldMatch(2)
.build();

Result r = analyze(disj, Version.CURRENT);
assertThat(r.minimumShouldMatch, equalTo(1));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you count the number of extractions and check verified for all queries?

assertThat(r.extractions, hasSize(2));
assertFalse(r.matchAllDocs);
assertFalse(r.verified);

Query q = new BooleanQuery.Builder()
.add(IntPoint.newRangeQuery("i", 0, 10), Occur.SHOULD)
.add(new TermQuery(new Term("f", "v1")), Occur.SHOULD)
.add(new TermQuery(new Term("f", "v1")), Occur.SHOULD)
.add(new TermQuery(new Term("f", "v1")), Occur.FILTER)
.setMinimumNumberShouldMatch(2)
.build();

Result result = analyze(q, Version.CURRENT);
assertThat(result.minimumShouldMatch, equalTo(1));
assertThat(result.extractions.size(), equalTo(2));
assertFalse(result.verified);
assertFalse(result.matchAllDocs);

q = new BooleanQuery.Builder()
.add(q, Occur.MUST)
.add(q, Occur.MUST)
.build();

result = analyze(q, Version.CURRENT);
assertThat(result.minimumShouldMatch, equalTo(1));
assertThat(result.extractions.size(), equalTo(2));
assertFalse(result.verified);
assertFalse(result.matchAllDocs);

Query q2 = new BooleanQuery.Builder()
.add(new TermQuery(new Term("f", "v1")), Occur.FILTER)
.add(IntPoint.newRangeQuery("i", 15, 20), Occur.SHOULD)
.add(new TermQuery(new Term("f", "v2")), Occur.SHOULD)
.add(new TermQuery(new Term("f", "v2")), Occur.MUST)
.setMinimumNumberShouldMatch(1)
.build();

result = analyze(q2, Version.CURRENT);
assertThat(result.minimumShouldMatch, equalTo(2));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's maybe also have a test for the case that there are multiple range queries on the same field, or multiple range queries on different fields?

assertThat(result.extractions, hasSize(3));
assertFalse(result.verified);
assertFalse(result.matchAllDocs);

// multiple range queries on different fields
Query q3 = new BooleanQuery.Builder()
.add(IntPoint.newRangeQuery("i", 15, 20), Occur.SHOULD)
.add(IntPoint.newRangeQuery("i2", 15, 20), Occur.SHOULD)
.add(new TermQuery(new Term("f", "v1")), Occur.SHOULD)
.add(new TermQuery(new Term("f", "v2")), Occur.MUST)
.setMinimumNumberShouldMatch(1)
.build();
result = analyze(q3, Version.CURRENT);
assertThat(result.minimumShouldMatch, equalTo(2));
assertThat(result.extractions, hasSize(4));
assertFalse(result.verified);
assertFalse(result.matchAllDocs);

// multiple disjoint range queries on the same field
Query q4 = new BooleanQuery.Builder()
.add(IntPoint.newRangeQuery("i", 15, 20), Occur.SHOULD)
.add(IntPoint.newRangeQuery("i", 25, 30), Occur.SHOULD)
.add(IntPoint.newRangeQuery("i", 35, 40), Occur.SHOULD)
.add(new TermQuery(new Term("f", "v1")), Occur.SHOULD)
.add(new TermQuery(new Term("f", "v2")), Occur.MUST)
.setMinimumNumberShouldMatch(1)
.build();
result = analyze(q4, Version.CURRENT);
assertThat(result.minimumShouldMatch, equalTo(2));
assertThat(result.extractions, hasSize(5));
assertFalse(result.verified);
assertFalse(result.matchAllDocs);

// multiple conjunction range queries on the same field
Query q5 = new BooleanQuery.Builder()
.add(new BooleanQuery.Builder()
.add(IntPoint.newRangeQuery("i", 15, 20), Occur.MUST)
.add(IntPoint.newRangeQuery("i", 25, 30), Occur.MUST)
.build(), Occur.MUST)
.add(IntPoint.newRangeQuery("i", 35, 40), Occur.MUST)
.add(new TermQuery(new Term("f", "v2")), Occur.MUST)
.build();
result = analyze(q5, Version.CURRENT);
assertThat(result.minimumShouldMatch, equalTo(2));
assertThat(result.extractions, hasSize(4));
assertFalse(result.verified);
assertFalse(result.matchAllDocs);

// multiple conjunction range queries on different fields
Query q6 = new BooleanQuery.Builder()
.add(new BooleanQuery.Builder()
.add(IntPoint.newRangeQuery("i", 15, 20), Occur.MUST)
.add(IntPoint.newRangeQuery("i2", 25, 30), Occur.MUST)
.build(), Occur.MUST)
.add(IntPoint.newRangeQuery("i", 35, 40), Occur.MUST)
.add(new TermQuery(new Term("f", "v2")), Occur.MUST)
.build();
result = analyze(q6, Version.CURRENT);
assertThat(result.minimumShouldMatch, equalTo(3));
assertThat(result.extractions, hasSize(4));
assertFalse(result.verified);
assertFalse(result.matchAllDocs);

// mixed term and range conjunctions
Query q7 = new BooleanQuery.Builder()
.add(new BooleanQuery.Builder()
.add(IntPoint.newRangeQuery("i", 1, 2), Occur.MUST)
.add(new TermQuery(new Term("f", "1")), Occur.MUST)
.build(), Occur.MUST)
.add(new BooleanQuery.Builder()
.add(IntPoint.newRangeQuery("i", 1, 2), Occur.MUST)
.add(new TermQuery(new Term("f", "2")), Occur.MUST)
.build(), Occur.MUST)
.build();
result = analyze(q7, Version.CURRENT);
assertThat(result.minimumShouldMatch, equalTo(3));
assertThat(result.extractions, hasSize(3));
assertFalse(result.verified);
assertFalse(result.matchAllDocs);
}

}