From 4e31ea69330efce62be1d5eb2b88988411517d53 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Tue, 3 Dec 2019 17:15:18 +0000 Subject: [PATCH 1/5] Fix query analyzer logic for mixed conjunctions of terms and ranges --- .../percolator/QueryAnalyzer.java | 30 ++++++--------- .../percolator/QueryAnalyzerTests.java | 37 +++++++++++++++++++ 2 files changed, 49 insertions(+), 18 deletions(-) diff --git a/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java b/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java index 362c8870f652d..b846d4bde1e4e 100644 --- a/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java +++ b/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java @@ -232,7 +232,7 @@ private static Result handleConjunction(List conjunctionsWithUnknowns) { List conjunctions = conjunctionsWithUnknowns.stream().filter(r -> r.isUnknown() == false).collect(Collectors.toList()); if (conjunctions.isEmpty()) { if (conjunctionsWithUnknowns.isEmpty()) { - throw new IllegalArgumentException("Must have at least on conjunction sub result"); + throw new IllegalArgumentException("Must have at least one conjunction sub result"); } return conjunctionsWithUnknowns.get(0); // all conjunctions are unknown, so just return the first one } @@ -261,23 +261,17 @@ private static Result handleConjunction(List conjunctionsWithUnknowns) { for (QueryExtraction queryExtraction : result.extractions) { if (queryExtraction.range != null) { // In case of range queries each extraction does not simply increment the - // minimum_should_match - // for that percolator query like for a term based extraction, so that can lead - // to more false - // positives for percolator queries with range queries than term based queries. - // The is because the way number fields are extracted from the document to be - // percolated. - // Per field a single range is extracted and if a percolator query has two or - // more range queries - // on the same field, then the minimum should match can be higher than clauses - // in the CoveringQuery. - // Therefore right now the minimum should match is incremented once per number - // field when processing - // the percolator query at index time. - if (seenRangeFields.add(queryExtraction.range.fieldName)) { - resultMsm = 1; - } else { - resultMsm = 0; + // minimum_should_match for that percolator query like for a term based extraction, + // so that can lead to more false positives for percolator queries with range queries + // than term based queries. + // This is because the way number fields are extracted from the document to be + // percolated. Per field a single range is extracted and if a percolator query has two or + // more range queries on the same field, then the minimum should match can be higher than clauses + // in the CoveringQuery. Therefore right now the minimum should match is only incremented once per + // number field when processing the percolator query at index time. + if (seenRangeFields.add(queryExtraction.range.fieldName) == false) { + resultMsm = Math.max(0, resultMsm - 1); + verified = false; } } diff --git a/modules/percolator/src/test/java/org/elasticsearch/percolator/QueryAnalyzerTests.java b/modules/percolator/src/test/java/org/elasticsearch/percolator/QueryAnalyzerTests.java index 91c815c40322e..c4575b771d4ee 100644 --- a/modules/percolator/src/test/java/org/elasticsearch/percolator/QueryAnalyzerTests.java +++ b/modules/percolator/src/test/java/org/elasticsearch/percolator/QueryAnalyzerTests.java @@ -1208,4 +1208,41 @@ public void testIntervalQueries() { assertTermsEqual(result.extractions, new Term("field", "a")); } + public void testCombinedRangeAndTermWithMinimumShouldMatch() { + + Query disj = new BooleanQuery.Builder() + .add(IntPoint.newRangeQuery("i", 0, 10), Occur.SHOULD) + .add(new TermQuery(new Term("f", "v1")), Occur.SHOULD) + .add(new TermQuery(new Term("f", "v1")), Occur.SHOULD) + .setMinimumNumberShouldMatch(2) + .build(); + + Result r = analyze(disj, Version.CURRENT); + assertThat(r.minimumShouldMatch, equalTo(1)); + + Query q = new BooleanQuery.Builder() + .add(IntPoint.newRangeQuery("i", 0, 10), Occur.SHOULD) + .add(new TermQuery(new Term("f", "v1")), Occur.SHOULD) + .add(new TermQuery(new Term("f", "v1")), Occur.SHOULD) + .add(new TermQuery(new Term("f", "v1")), Occur.FILTER) + .setMinimumNumberShouldMatch(2) + .build(); + + Result result = analyze(q, Version.CURRENT); + assertThat(result.minimumShouldMatch, equalTo(1)); + assertThat(result.extractions.size(), equalTo(2)); + assertFalse(result.verified); + + Query q2 = new BooleanQuery.Builder() + .add(new TermQuery(new Term("f", "v1")), Occur.FILTER) + .add(IntPoint.newRangeQuery("i", 15, 20), Occur.SHOULD) + .add(new TermQuery(new Term("f", "v2")), Occur.SHOULD) + .add(new TermQuery(new Term("f", "v2")), Occur.MUST) + .setMinimumNumberShouldMatch(1) + .build(); + + result = analyze(q2, Version.CURRENT); + assertThat(result.minimumShouldMatch, equalTo(2)); + } + } From 85dba06848a916af248508b676efd144993aad22 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Tue, 3 Dec 2019 17:30:33 +0000 Subject: [PATCH 2/5] some more tests --- .../elasticsearch/percolator/QueryAnalyzerTests.java | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/modules/percolator/src/test/java/org/elasticsearch/percolator/QueryAnalyzerTests.java b/modules/percolator/src/test/java/org/elasticsearch/percolator/QueryAnalyzerTests.java index c4575b771d4ee..3cf72b3b9580d 100644 --- a/modules/percolator/src/test/java/org/elasticsearch/percolator/QueryAnalyzerTests.java +++ b/modules/percolator/src/test/java/org/elasticsearch/percolator/QueryAnalyzerTests.java @@ -1233,6 +1233,16 @@ public void testCombinedRangeAndTermWithMinimumShouldMatch() { assertThat(result.extractions.size(), equalTo(2)); assertFalse(result.verified); + q = new BooleanQuery.Builder() + .add(q, Occur.MUST) + .add(q, Occur.MUST) + .build(); + + result = analyze(q, Version.CURRENT); + assertThat(result.minimumShouldMatch, equalTo(1)); + assertThat(result.extractions.size(), equalTo(2)); + assertFalse(result.verified); + Query q2 = new BooleanQuery.Builder() .add(new TermQuery(new Term("f", "v1")), Occur.FILTER) .add(IntPoint.newRangeQuery("i", 15, 20), Occur.SHOULD) From 62d30e6597447d155cb3b942f10271925ea28f44 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Wed, 4 Dec 2019 10:20:18 +0000 Subject: [PATCH 3/5] more testing, tweak msm logic --- .../percolator/QueryAnalyzer.java | 9 ++- .../percolator/QueryAnalyzerTests.java | 68 +++++++++++++++++++ 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java b/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java index b846d4bde1e4e..7f6df3124595b 100644 --- a/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java +++ b/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java @@ -269,7 +269,11 @@ private static Result handleConjunction(List conjunctionsWithUnknowns) { // more range queries on the same field, then the minimum should match can be higher than clauses // in the CoveringQuery. Therefore right now the minimum should match is only incremented once per // number field when processing the percolator query at index time. - if (seenRangeFields.add(queryExtraction.range.fieldName) == false) { + // For multiple ranges within a single extraction (ie from an existing conjunction or disjunction) + // then this will already have been taken care of, so we only check against fieldnames from + // previously processed extractions, and don't add to the seenRangeFields list until all + // extractions from this result are processed + if (seenRangeFields.contains(queryExtraction.range.fieldName)) { resultMsm = Math.max(0, resultMsm - 1); verified = false; } @@ -280,6 +284,9 @@ private static Result handleConjunction(List conjunctionsWithUnknowns) { verified = false; } } + // add range fields from this Result to the seenRangeFields set so that minimumShouldMatch is correctly + // calculated for subsequent Results + result.extractions.stream().filter(e -> e.range != null).forEach(e -> seenRangeFields.add(e.range.fieldName)); msm += resultMsm; if (result.verified == false diff --git a/modules/percolator/src/test/java/org/elasticsearch/percolator/QueryAnalyzerTests.java b/modules/percolator/src/test/java/org/elasticsearch/percolator/QueryAnalyzerTests.java index 3cf72b3b9580d..f6d808b15706b 100644 --- a/modules/percolator/src/test/java/org/elasticsearch/percolator/QueryAnalyzerTests.java +++ b/modules/percolator/src/test/java/org/elasticsearch/percolator/QueryAnalyzerTests.java @@ -78,6 +78,7 @@ import static org.elasticsearch.percolator.QueryAnalyzer.analyze; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.is; +import static org.hamcrest.collection.IsCollectionWithSize.hasSize; public class QueryAnalyzerTests extends ESTestCase { @@ -1219,6 +1220,9 @@ public void testCombinedRangeAndTermWithMinimumShouldMatch() { Result r = analyze(disj, Version.CURRENT); assertThat(r.minimumShouldMatch, equalTo(1)); + assertThat(r.extractions, hasSize(2)); + assertFalse(r.matchAllDocs); + assertFalse(r.verified); Query q = new BooleanQuery.Builder() .add(IntPoint.newRangeQuery("i", 0, 10), Occur.SHOULD) @@ -1232,6 +1236,7 @@ public void testCombinedRangeAndTermWithMinimumShouldMatch() { assertThat(result.minimumShouldMatch, equalTo(1)); assertThat(result.extractions.size(), equalTo(2)); assertFalse(result.verified); + assertFalse(result.matchAllDocs); q = new BooleanQuery.Builder() .add(q, Occur.MUST) @@ -1242,6 +1247,7 @@ public void testCombinedRangeAndTermWithMinimumShouldMatch() { assertThat(result.minimumShouldMatch, equalTo(1)); assertThat(result.extractions.size(), equalTo(2)); assertFalse(result.verified); + assertFalse(result.matchAllDocs); Query q2 = new BooleanQuery.Builder() .add(new TermQuery(new Term("f", "v1")), Occur.FILTER) @@ -1253,6 +1259,68 @@ public void testCombinedRangeAndTermWithMinimumShouldMatch() { result = analyze(q2, Version.CURRENT); assertThat(result.minimumShouldMatch, equalTo(2)); + assertThat(result.extractions, hasSize(3)); + assertFalse(result.verified); + assertFalse(result.matchAllDocs); + + // multiple range queries on different fields + Query q3 = new BooleanQuery.Builder() + .add(IntPoint.newRangeQuery("i", 15, 20), Occur.SHOULD) + .add(IntPoint.newRangeQuery("i2", 15, 20), Occur.SHOULD) + .add(new TermQuery(new Term("f", "v1")), Occur.SHOULD) + .add(new TermQuery(new Term("f", "v2")), Occur.MUST) + .setMinimumNumberShouldMatch(1) + .build(); + result = analyze(q3, Version.CURRENT); + assertThat(result.minimumShouldMatch, equalTo(2)); + assertThat(result.extractions, hasSize(4)); + assertFalse(result.verified); + assertFalse(result.matchAllDocs); + + // multiple disjoint range queries on the same field + Query q4 = new BooleanQuery.Builder() + .add(IntPoint.newRangeQuery("i", 15, 20), Occur.SHOULD) + .add(IntPoint.newRangeQuery("i", 25, 30), Occur.SHOULD) + .add(IntPoint.newRangeQuery("i", 35, 40), Occur.SHOULD) + .add(new TermQuery(new Term("f", "v1")), Occur.SHOULD) + .add(new TermQuery(new Term("f", "v2")), Occur.MUST) + .setMinimumNumberShouldMatch(1) + .build(); + result = analyze(q4, Version.CURRENT); + assertThat(result.minimumShouldMatch, equalTo(2)); + assertThat(result.extractions, hasSize(5)); + assertFalse(result.verified); + assertFalse(result.matchAllDocs); + + // multiple conjunction range queries on the same field + Query q5 = new BooleanQuery.Builder() + .add(new BooleanQuery.Builder() + .add(IntPoint.newRangeQuery("i", 15, 20), Occur.MUST) + .add(IntPoint.newRangeQuery("i", 25, 30), Occur.MUST) + .build(), Occur.MUST) + .add(IntPoint.newRangeQuery("i", 35, 40), Occur.MUST) + .add(new TermQuery(new Term("f", "v2")), Occur.MUST) + .build(); + result = analyze(q5, Version.CURRENT); + assertThat(result.minimumShouldMatch, equalTo(2)); + assertThat(result.extractions, hasSize(4)); + assertFalse(result.verified); + assertFalse(result.matchAllDocs); + + // multiple conjunction range queries on different fields + Query q6 = new BooleanQuery.Builder() + .add(new BooleanQuery.Builder() + .add(IntPoint.newRangeQuery("i", 15, 20), Occur.MUST) + .add(IntPoint.newRangeQuery("i2", 25, 30), Occur.MUST) + .build(), Occur.MUST) + .add(IntPoint.newRangeQuery("i", 35, 40), Occur.MUST) + .add(new TermQuery(new Term("f", "v2")), Occur.MUST) + .build(); + result = analyze(q6, Version.CURRENT); + assertThat(result.minimumShouldMatch, equalTo(3)); + assertThat(result.extractions, hasSize(4)); + assertFalse(result.verified); + assertFalse(result.matchAllDocs); } } From f5e4e0f6116e8ed044256ea013ca4461c9bbfdf9 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Mon, 9 Dec 2019 16:34:48 +0000 Subject: [PATCH 4/5] feedback --- .../java/org/elasticsearch/percolator/QueryAnalyzer.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java b/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java index 7f6df3124595b..d185db87750b5 100644 --- a/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java +++ b/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java @@ -286,7 +286,11 @@ private static Result handleConjunction(List conjunctionsWithUnknowns) { } // add range fields from this Result to the seenRangeFields set so that minimumShouldMatch is correctly // calculated for subsequent Results - result.extractions.stream().filter(e -> e.range != null).forEach(e -> seenRangeFields.add(e.range.fieldName)); + result.extractions.stream() + .map(e -> e.range) + .filter(Objects::nonNull) + .map(e -> e.fieldName) + .forEach(seenRangeFields::add); msm += resultMsm; if (result.verified == false From 6d91f8d00c0b36037eb1abe7f048fab4ade6c977 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Tue, 10 Dec 2019 09:50:17 +0000 Subject: [PATCH 5/5] Don't reduce msm twice for duplicate range extractions --- .../percolator/QueryAnalyzer.java | 27 ++++++++++--------- .../percolator/QueryAnalyzerTests.java | 17 ++++++++++++ 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java b/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java index d185db87750b5..f08600cdfd0e9 100644 --- a/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java +++ b/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java @@ -247,16 +247,10 @@ private static Result handleConjunction(List conjunctionsWithUnknowns) { int msm = 0; boolean verified = conjunctionsWithUnknowns.size() == conjunctions.size(); boolean matchAllDocs = true; - boolean hasDuplicateTerms = false; Set extractions = new HashSet<>(); Set seenRangeFields = new HashSet<>(); for (Result result : conjunctions) { - // In case that there are duplicate query extractions we need to be careful with - // incrementing msm, - // because that could lead to valid matches not becoming candidate matches: - // query: (field:val1 AND field:val2) AND (field:val2 AND field:val3) - // doc: field: val1 val2 val3 - // So lets be protective and decrease the msm: + int resultMsm = result.minimumShouldMatch; for (QueryExtraction queryExtraction : result.extractions) { if (queryExtraction.range != null) { @@ -278,12 +272,20 @@ private static Result handleConjunction(List conjunctionsWithUnknowns) { verified = false; } } - - if (extractions.contains(queryExtraction)) { - resultMsm = Math.max(0, resultMsm - 1); - verified = false; + else { + // In case that there are duplicate term query extractions we need to be careful with + // incrementing msm, because that could lead to valid matches not becoming candidate matches: + // query: (field:val1 AND field:val2) AND (field:val2 AND field:val3) + // doc: field: val1 val2 val3 + // So lets be protective and decrease the msm: + if (extractions.contains(queryExtraction)) { + resultMsm = Math.max(0, resultMsm - 1); + verified = false; + } } } + msm += resultMsm; + // add range fields from this Result to the seenRangeFields set so that minimumShouldMatch is correctly // calculated for subsequent Results result.extractions.stream() @@ -291,7 +293,6 @@ private static Result handleConjunction(List conjunctionsWithUnknowns) { .filter(Objects::nonNull) .map(e -> e.fieldName) .forEach(seenRangeFields::add); - msm += resultMsm; if (result.verified == false // If some inner extractions are optional, the result can't be verified @@ -304,7 +305,7 @@ private static Result handleConjunction(List conjunctionsWithUnknowns) { if (matchAllDocs) { return new Result(matchAllDocs, verified); } else { - return new Result(verified, extractions, hasDuplicateTerms ? 1 : msm); + return new Result(verified, extractions, msm); } } diff --git a/modules/percolator/src/test/java/org/elasticsearch/percolator/QueryAnalyzerTests.java b/modules/percolator/src/test/java/org/elasticsearch/percolator/QueryAnalyzerTests.java index f6d808b15706b..1c00d0555b41a 100644 --- a/modules/percolator/src/test/java/org/elasticsearch/percolator/QueryAnalyzerTests.java +++ b/modules/percolator/src/test/java/org/elasticsearch/percolator/QueryAnalyzerTests.java @@ -1321,6 +1321,23 @@ public void testCombinedRangeAndTermWithMinimumShouldMatch() { assertThat(result.extractions, hasSize(4)); assertFalse(result.verified); assertFalse(result.matchAllDocs); + + // mixed term and range conjunctions + Query q7 = new BooleanQuery.Builder() + .add(new BooleanQuery.Builder() + .add(IntPoint.newRangeQuery("i", 1, 2), Occur.MUST) + .add(new TermQuery(new Term("f", "1")), Occur.MUST) + .build(), Occur.MUST) + .add(new BooleanQuery.Builder() + .add(IntPoint.newRangeQuery("i", 1, 2), Occur.MUST) + .add(new TermQuery(new Term("f", "2")), Occur.MUST) + .build(), Occur.MUST) + .build(); + result = analyze(q7, Version.CURRENT); + assertThat(result.minimumShouldMatch, equalTo(3)); + assertThat(result.extractions, hasSize(3)); + assertFalse(result.verified); + assertFalse(result.matchAllDocs); } }