From d730628886a9675d41b3afc6bfb66cb2880e9e2a Mon Sep 17 00:00:00 2001 From: "gesong.samuel" Date: Thu, 5 Sep 2024 15:16:06 +0800 Subject: [PATCH 1/3] fix escaped wildcard query on wildcard field Signed-off-by: gesong.samuel --- .../index/mapper/WildcardFieldMapper.java | 70 +++++++++++++------ .../index/mapper/WildcardFieldTypeTests.java | 32 +++++++++ 2 files changed, 82 insertions(+), 20 deletions(-) diff --git a/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java b/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java index cd95e320209ee..3d21c13343b68 100644 --- a/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java +++ b/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java @@ -430,22 +430,27 @@ public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, bo finalValue = value; } Predicate matchPredicate; - if (value.contains("?")) { - Automaton automaton = WildcardQuery.toAutomaton(new Term(name(), finalValue)); - CompiledAutomaton compiledAutomaton = new CompiledAutomaton(automaton); + Automaton automaton = WildcardQuery.toAutomaton(new Term(name(), finalValue)); + CompiledAutomaton compiledAutomaton = new CompiledAutomaton(automaton); + if (compiledAutomaton.type == CompiledAutomaton.AUTOMATON_TYPE.SINGLE) { + // when type equals SINGLE, #compiledAutomaton.runAutomaton is null matchPredicate = s -> { if (caseInsensitive) { s = s.toLowerCase(Locale.ROOT); } - BytesRef valueBytes = BytesRefs.toBytesRef(s); - return compiledAutomaton.runAutomaton.run(valueBytes.bytes, valueBytes.offset, valueBytes.length); + return s.equals(finalValue); }; + } else if (compiledAutomaton.type == CompiledAutomaton.AUTOMATON_TYPE.ALL) { + return existsQuery(context); + } else if (compiledAutomaton.type == CompiledAutomaton.AUTOMATON_TYPE.NONE) { + return new MatchNoDocsQuery("Wildcard expression matches nothing"); } else { matchPredicate = s -> { if (caseInsensitive) { s = s.toLowerCase(Locale.ROOT); } - return Regex.simpleMatch(finalValue, s); + BytesRef valueBytes = BytesRefs.toBytesRef(s); + return compiledAutomaton.runAutomaton.run(valueBytes.bytes, valueBytes.offset, valueBytes.length); }; } @@ -468,22 +473,30 @@ public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, bo // Package-private for testing static Set getRequiredNGrams(String value) { Set terms = new HashSet<>(); + + if (value.isEmpty()) { + return terms; + } + int pos = 0; + String rawSequence = null; String currentSequence = null; if (!value.startsWith("?") && !value.startsWith("*")) { // Can add prefix term - currentSequence = getNonWildcardSequence(value, 0); + rawSequence = getNonWildcardSequence(value, 0); + currentSequence = performEscape(rawSequence); if (currentSequence.length() == 1) { - terms.add(new String(new char[] { 0, currentSequence.charAt(0) })); + terms.add(new String(new char[]{0, currentSequence.charAt(0)})); } else { - terms.add(new String(new char[] { 0, currentSequence.charAt(0), currentSequence.charAt(1) })); + terms.add(new String(new char[]{0, currentSequence.charAt(0), currentSequence.charAt(1)})); } } else { pos = findNonWildcardSequence(value, pos); - currentSequence = getNonWildcardSequence(value, pos); + rawSequence = getNonWildcardSequence(value, pos); } while (pos < value.length()) { - boolean isEndOfValue = pos + currentSequence.length() == value.length(); + boolean isEndOfValue = pos + rawSequence.length() == value.length(); + currentSequence = performEscape(rawSequence); if (!currentSequence.isEmpty() && currentSequence.length() < 3 && !isEndOfValue && pos > 0) { // If this is a prefix or suffix of length < 3, then we already have a longer token including the anchor. terms.add(currentSequence); @@ -495,15 +508,15 @@ static Set getRequiredNGrams(String value) { if (isEndOfValue) { // This is the end of the input. We can attach a suffix anchor. if (currentSequence.length() == 1) { - terms.add(new String(new char[] { currentSequence.charAt(0), 0 })); + terms.add(new String(new char[]{currentSequence.charAt(0), 0})); } else { char a = currentSequence.charAt(currentSequence.length() - 2); char b = currentSequence.charAt(currentSequence.length() - 1); - terms.add(new String(new char[] { a, b, 0 })); + terms.add(new String(new char[]{a, b, 0})); } } - pos = findNonWildcardSequence(value, pos + currentSequence.length()); - currentSequence = getNonWildcardSequence(value, pos); + pos = findNonWildcardSequence(value, pos + rawSequence.length()); + rawSequence = getNonWildcardSequence(value, pos); } return terms; } @@ -511,7 +524,8 @@ static Set getRequiredNGrams(String value) { private static String getNonWildcardSequence(String value, int startFrom) { for (int i = startFrom; i < value.length(); i++) { char c = value.charAt(i); - if (c == '?' || c == '*') { + if ((c == '?' || c == '*') && + (i == 0 || value.charAt(i - 1) != '\\')) { return value.substring(startFrom, i); } } @@ -529,6 +543,22 @@ private static int findNonWildcardSequence(String value, int startFrom) { return value.length(); } + private static String performEscape(String str) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < str.length(); i++) { + if (str.charAt(i) == '\\' && (i + 1) < str.length()) { + char c = str.charAt(i + 1); + if (c == '*' || c == '?') { + i++; + } + } + sb.append(str.charAt(i)); + } + assert !sb.toString().contains("\\*"); + assert !sb.toString().contains("\\?"); + return sb.toString(); + } + @Override public Query regexpQuery( String value, @@ -616,10 +646,10 @@ private static Query regexpToQuery(String fieldName, RegExp regExp) { query = builder.build(); } else if ((regExp.kind == RegExp.Kind.REGEXP_REPEAT_MIN || regExp.kind == RegExp.Kind.REGEXP_REPEAT_MINMAX) && regExp.min > 0) { - return regexpToQuery(fieldName, regExp.exp1); - } else { - return new MatchAllDocsQuery(); - } + return regexpToQuery(fieldName, regExp.exp1); + } else { + return new MatchAllDocsQuery(); + } if (query.clauses().size() == 1) { return query.iterator().next().getQuery(); } else if (query.clauses().size() == 0) { diff --git a/server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTests.java b/server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTests.java index cd2a23cf94c37..f4c49e578bab0 100644 --- a/server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTests.java +++ b/server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTests.java @@ -88,6 +88,38 @@ public void testWildcardQuery() { ); } + public void testEscapedWildcardQuery() { + MappedFieldType ft = new WildcardFieldMapper.WildcardFieldType("field"); + Set expectedTerms = new HashSet<>(); + expectedTerms.add(prefixAnchored("*")); + expectedTerms.add(suffixAnchored("*")); + + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + for (String term : expectedTerms) { + builder.add(new TermQuery(new Term("field", term)), BooleanClause.Occur.FILTER); + } + + assertEquals( + new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "\\**\\*"), + ft.wildcardQuery("\\**\\*", null, null) + ); + + assertEquals( + new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "\\*"), + ft.wildcardQuery("\\*", null, null) + ); + + expectedTerms.remove(suffixAnchored("*")); + builder = new BooleanQuery.Builder(); + for (String term : expectedTerms) { + builder.add(new TermQuery(new Term("field", term)), BooleanClause.Occur.FILTER); + } + assertEquals( + new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "\\**"), + ft.wildcardQuery("\\**", null, null) + ); + } + public void testMultipleWildcardsInQuery() { final String pattern = "a?cd*efg?h"; MappedFieldType ft = new WildcardFieldMapper.WildcardFieldType("field"); From dfc317b1ba43c554421c457da662c301c24266bc Mon Sep 17 00:00:00 2001 From: "gesong.samuel" Date: Thu, 5 Sep 2024 16:11:59 +0800 Subject: [PATCH 2/3] fix format error Signed-off-by: gesong.samuel --- .../index/mapper/WildcardFieldMapper.java | 20 +++++++++---------- .../index/mapper/WildcardFieldTypeTests.java | 10 ++-------- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java b/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java index 3d21c13343b68..0cb416a9b8370 100644 --- a/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java +++ b/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java @@ -40,7 +40,6 @@ import org.apache.lucene.util.automaton.RegExp; import org.opensearch.common.lucene.BytesRefs; import org.opensearch.common.lucene.Lucene; -import org.opensearch.common.regex.Regex; import org.opensearch.common.unit.Fuzziness; import org.opensearch.core.xcontent.XContentParser; import org.opensearch.index.analysis.IndexAnalyzers; @@ -486,9 +485,9 @@ static Set getRequiredNGrams(String value) { rawSequence = getNonWildcardSequence(value, 0); currentSequence = performEscape(rawSequence); if (currentSequence.length() == 1) { - terms.add(new String(new char[]{0, currentSequence.charAt(0)})); + terms.add(new String(new char[] { 0, currentSequence.charAt(0) })); } else { - terms.add(new String(new char[]{0, currentSequence.charAt(0), currentSequence.charAt(1)})); + terms.add(new String(new char[] { 0, currentSequence.charAt(0), currentSequence.charAt(1) })); } } else { pos = findNonWildcardSequence(value, pos); @@ -508,11 +507,11 @@ static Set getRequiredNGrams(String value) { if (isEndOfValue) { // This is the end of the input. We can attach a suffix anchor. if (currentSequence.length() == 1) { - terms.add(new String(new char[]{currentSequence.charAt(0), 0})); + terms.add(new String(new char[] { currentSequence.charAt(0), 0 })); } else { char a = currentSequence.charAt(currentSequence.length() - 2); char b = currentSequence.charAt(currentSequence.length() - 1); - terms.add(new String(new char[]{a, b, 0})); + terms.add(new String(new char[] { a, b, 0 })); } } pos = findNonWildcardSequence(value, pos + rawSequence.length()); @@ -524,8 +523,7 @@ static Set getRequiredNGrams(String value) { private static String getNonWildcardSequence(String value, int startFrom) { for (int i = startFrom; i < value.length(); i++) { char c = value.charAt(i); - if ((c == '?' || c == '*') && - (i == 0 || value.charAt(i - 1) != '\\')) { + if ((c == '?' || c == '*') && (i == 0 || value.charAt(i - 1) != '\\')) { return value.substring(startFrom, i); } } @@ -646,10 +644,10 @@ private static Query regexpToQuery(String fieldName, RegExp regExp) { query = builder.build(); } else if ((regExp.kind == RegExp.Kind.REGEXP_REPEAT_MIN || regExp.kind == RegExp.Kind.REGEXP_REPEAT_MINMAX) && regExp.min > 0) { - return regexpToQuery(fieldName, regExp.exp1); - } else { - return new MatchAllDocsQuery(); - } + return regexpToQuery(fieldName, regExp.exp1); + } else { + return new MatchAllDocsQuery(); + } if (query.clauses().size() == 1) { return query.iterator().next().getQuery(); } else if (query.clauses().size() == 0) { diff --git a/server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTests.java b/server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTests.java index f4c49e578bab0..1a813495e9033 100644 --- a/server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTests.java +++ b/server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTests.java @@ -104,20 +104,14 @@ public void testEscapedWildcardQuery() { ft.wildcardQuery("\\**\\*", null, null) ); - assertEquals( - new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "\\*"), - ft.wildcardQuery("\\*", null, null) - ); + assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "\\*"), ft.wildcardQuery("\\*", null, null)); expectedTerms.remove(suffixAnchored("*")); builder = new BooleanQuery.Builder(); for (String term : expectedTerms) { builder.add(new TermQuery(new Term("field", term)), BooleanClause.Occur.FILTER); } - assertEquals( - new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "\\**"), - ft.wildcardQuery("\\**", null, null) - ); + assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "\\**"), ft.wildcardQuery("\\**", null, null)); } public void testMultipleWildcardsInQuery() { From 1d9765c7bc417f9eecbff54df73d6d2e102f25a4 Mon Sep 17 00:00:00 2001 From: "gesong.samuel" Date: Mon, 9 Sep 2024 15:31:37 +0800 Subject: [PATCH 3/3] add change log Signed-off-by: gesong.samuel --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c95f84d78842..4ad0f93bed9dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -117,7 +117,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Fix null values indexed as "null" strings in flat_object field ([#14069](https://github.com/opensearch-project/OpenSearch/pull/14069)) - Fix terms query on wildcard field returns nothing ([#15607](https://github.com/opensearch-project/OpenSearch/pull/15607)) - Fix remote snapshot file_cache exceeding capacity ([#15077](https://github.com/opensearch-project/OpenSearch/pull/15077)) - +- Fix wildcard query containing escaped character ([#15737](https://github.com/opensearch-project/OpenSearch/pull/15737)) ### Security [Unreleased 2.x]: https://github.com/opensearch-project/OpenSearch/compare/2.15...2.x