From baf369f8fdaf0dab7f618d1fe7d558d41238a852 Mon Sep 17 00:00:00 2001 From: markharwood Date: Thu, 5 Dec 2019 11:11:28 +0000 Subject: [PATCH 01/32] First cut at Wildcard field optimised for wildcard queries Closes #48852 --- .../license/XPackLicenseState.java | 10 + .../elasticsearch/xpack/core/XPackField.java | 2 + .../core/action/XPackInfoFeatureAction.java | 1 + .../core/action/XPackUsageFeatureAction.java | 1 + .../wildcard/WildcardFeatureSetUsage.java | 60 ++ .../test/wildcard/10_wildcard_basic.yml | 177 ++++++ x-pack/plugin/wildcard/build.gradle | 18 + .../xpack/wildcard/Wildcard.java | 45 ++ .../wildcard/WildcardInfoTransportAction.java | 43 ++ .../WildcardUsageTransportAction.java | 74 +++ .../mapper/TaperedNgramTokenFilter.java | 104 ++++ .../wildcard/mapper/WildcardFieldMapper.java | 581 ++++++++++++++++++ .../mapper/WildcardOnBinaryDvQuery.java | 95 +++ .../wildcard/mapper/WildcardOnDvQuery.java | 99 +++ .../mapper/WildcardPositionBasedQuery.java | 203 ++++++ .../xpack/wildcard/WildcardFieldTests.java | 233 +++++++ .../WildcardInfoTransportActionTests.java | 53 ++ .../mapper/TaperedNgramTokenFilterTests.java | 77 +++ .../mapper/WildcardFieldMapperTests.java | 168 +++++ .../mapper/WildcardFieldTypeTests.java | 19 + 20 files changed, 2063 insertions(+) create mode 100644 x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/wildcard/WildcardFeatureSetUsage.java create mode 100644 x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml create mode 100644 x-pack/plugin/wildcard/build.gradle create mode 100644 x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/Wildcard.java create mode 100644 x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/WildcardInfoTransportAction.java create mode 100644 x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/WildcardUsageTransportAction.java create mode 100644 x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenFilter.java create mode 100644 x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java create mode 100644 x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardOnBinaryDvQuery.java create mode 100644 x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardOnDvQuery.java create mode 100644 x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardPositionBasedQuery.java create mode 100644 x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/WildcardFieldTests.java create mode 100644 x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/WildcardInfoTransportActionTests.java create mode 100644 x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenFilterTests.java create mode 100644 x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java create mode 100644 x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldTypeTests.java diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/license/XPackLicenseState.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/license/XPackLicenseState.java index 1d90ef9488789..4e58023eb260b 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/license/XPackLicenseState.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/license/XPackLicenseState.java @@ -613,6 +613,16 @@ public boolean isFlattenedAllowed() { public boolean isVectorsAllowed() { return allowForAllLicenses(); } + + + /** + * Determine if Wildcard support should be enabled. + *

+ * Wildcard is available for all license types except {@link OperationMode#MISSING} + */ + public synchronized boolean isWildcardAllowed() { + return status.active; + } public boolean isOdbcAllowed() { return isAllowedByLicense(OperationMode.PLATINUM); diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackField.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackField.java index 3bc1a44e7b820..e5caa3f4322be 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackField.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackField.java @@ -44,6 +44,8 @@ public final class XPackField { /** Name constant for flattened fields. */ /** Name constant for the vectors feature. */ public static final String VECTORS = "vectors"; + /** Name constant for the wildcard feature. */ + public static final String WILDCARD = "wildcard"; /** Name constant for the voting-only-node feature. */ public static final String VOTING_ONLY = "voting_only"; /** Name constant for the frozen index feature. */ diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/action/XPackInfoFeatureAction.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/action/XPackInfoFeatureAction.java index 0d97119434cc3..dae576f3eaef8 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/action/XPackInfoFeatureAction.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/action/XPackInfoFeatureAction.java @@ -36,6 +36,7 @@ public class XPackInfoFeatureAction extends ActionType public static final XPackInfoFeatureAction CCR = new XPackInfoFeatureAction(XPackField.CCR); public static final XPackInfoFeatureAction TRANSFORM = new XPackInfoFeatureAction(XPackField.TRANSFORM); public static final XPackInfoFeatureAction VECTORS = new XPackInfoFeatureAction(XPackField.VECTORS); + public static final XPackInfoFeatureAction WILDCARD = new XPackInfoFeatureAction(XPackField.WILDCARD); public static final XPackInfoFeatureAction VOTING_ONLY = new XPackInfoFeatureAction(XPackField.VOTING_ONLY); public static final XPackInfoFeatureAction FROZEN_INDICES = new XPackInfoFeatureAction(XPackField.FROZEN_INDICES); public static final XPackInfoFeatureAction SPATIAL = new XPackInfoFeatureAction(XPackField.SPATIAL); diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/action/XPackUsageFeatureAction.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/action/XPackUsageFeatureAction.java index c696fdeaa3e29..afa3abe7fbb20 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/action/XPackUsageFeatureAction.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/action/XPackUsageFeatureAction.java @@ -36,6 +36,7 @@ public class XPackUsageFeatureAction extends ActionType> getActions() { + return Arrays.asList( + new ActionPlugin.ActionHandler<>(XPackUsageFeatureAction.WILDCARD, WildcardUsageTransportAction.class), + new ActionPlugin.ActionHandler<>(XPackInfoFeatureAction.WILDCARD, WildcardInfoTransportAction.class)); + } + + @Override + public Map getMappers() { + Map mappers = new LinkedHashMap<>(); + mappers.put(WildcardFieldMapper.CONTENT_TYPE, new WildcardFieldMapper.TypeParser()); + return Collections.unmodifiableMap(mappers); + } +} diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/WildcardInfoTransportAction.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/WildcardInfoTransportAction.java new file mode 100644 index 0000000000000..fea55a9c0662a --- /dev/null +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/WildcardInfoTransportAction.java @@ -0,0 +1,43 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ +package org.elasticsearch.xpack.wildcard; + +import org.elasticsearch.action.support.ActionFilters; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.license.XPackLicenseState; +import org.elasticsearch.transport.TransportService; +import org.elasticsearch.xpack.core.XPackField; +import org.elasticsearch.xpack.core.action.XPackInfoFeatureAction; +import org.elasticsearch.xpack.core.action.XPackInfoFeatureTransportAction; + +public class WildcardInfoTransportAction extends XPackInfoFeatureTransportAction { + + private final XPackLicenseState licenseState; + + @Inject + public WildcardInfoTransportAction(TransportService transportService, ActionFilters actionFilters, + Settings settings, XPackLicenseState licenseState) { + super(XPackInfoFeatureAction.WILDCARD.name(), transportService, actionFilters); + this.licenseState = licenseState; + } + + @Override + public String name() { + return XPackField.WILDCARD; + } + + @Override + public boolean available() { + return licenseState.isWildcardAllowed(); + } + + @Override + public boolean enabled() { + return true; + } + +} diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/WildcardUsageTransportAction.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/WildcardUsageTransportAction.java new file mode 100644 index 0000000000000..96bb8452034ee --- /dev/null +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/WildcardUsageTransportAction.java @@ -0,0 +1,74 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ +package org.elasticsearch.xpack.wildcard; + +import org.elasticsearch.action.ActionListener; +import org.elasticsearch.action.support.ActionFilters; +import org.elasticsearch.cluster.ClusterState; +import org.elasticsearch.cluster.metadata.IndexMetaData; +import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver; +import org.elasticsearch.cluster.metadata.MappingMetaData; +import org.elasticsearch.cluster.service.ClusterService; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.license.XPackLicenseState; +import org.elasticsearch.protocol.xpack.XPackUsageRequest; +import org.elasticsearch.tasks.Task; +import org.elasticsearch.threadpool.ThreadPool; +import org.elasticsearch.transport.TransportService; +import org.elasticsearch.xpack.core.action.XPackUsageFeatureAction; +import org.elasticsearch.xpack.core.action.XPackUsageFeatureResponse; +import org.elasticsearch.xpack.core.action.XPackUsageFeatureTransportAction; +import org.elasticsearch.xpack.core.wildcard.WildcardFeatureSetUsage; +import org.elasticsearch.xpack.wildcard.mapper.WildcardFieldMapper; + +import java.util.Map; + +public class WildcardUsageTransportAction extends XPackUsageFeatureTransportAction { + + private final Settings settings; + private final XPackLicenseState licenseState; + + @Inject + public WildcardUsageTransportAction(TransportService transportService, ClusterService clusterService, ThreadPool threadPool, + ActionFilters actionFilters, IndexNameExpressionResolver indexNameExpressionResolver, + Settings settings, XPackLicenseState licenseState) { + super(XPackUsageFeatureAction.WILDCARD.name(), transportService, clusterService, + threadPool, actionFilters, indexNameExpressionResolver); + this.settings = settings; + this.licenseState = licenseState; + } + + @Override + protected void masterOperation(Task task, XPackUsageRequest request, ClusterState state, + ActionListener listener) { + boolean wildcardAvailable = licenseState.isWildcardAllowed(); + int numWildcardFields = 0; + + if (wildcardAvailable && state != null) { + for (IndexMetaData indexMetaData : state.metaData()) { + MappingMetaData mappingMetaData = indexMetaData.mapping(); + if (mappingMetaData != null) { + Map mappings = mappingMetaData.getSourceAsMap(); + if (mappings.containsKey("properties")) { + @SuppressWarnings("unchecked") Map> fieldMappings = + (Map>) mappings.get("properties"); + for (Map typeDefinition : fieldMappings.values()) { + String fieldType = (String) typeDefinition.get("type"); + if (fieldType != null) { + if (fieldType.equals(WildcardFieldMapper.CONTENT_TYPE)) { + numWildcardFields++; + } + } + } + } + } + } + } + WildcardFeatureSetUsage usage = new WildcardFeatureSetUsage(wildcardAvailable, numWildcardFields); + listener.onResponse(new XPackUsageFeatureResponse(usage)); + } +} diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenFilter.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenFilter.java new file mode 100644 index 0000000000000..8084e4b0f95ca --- /dev/null +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenFilter.java @@ -0,0 +1,104 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +package org.elasticsearch.xpack.wildcard.mapper; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; + +import java.io.IOException; + +/** + * Variation on Lucene's NGramTokenFilter that uses smaller (1 character) ngrams for the final few characters in a string. Helps improve + * performance of short suffix queries e.g. "*.exe" + */ +public final class TaperedNgramTokenFilter extends TokenFilter { + + private final int maxGram; + + private char[] curTermBuffer; + private int curTermLength; + private int curTermCodePointCount; + private int curGramSize; + private int curPos; + private int curPosIncr; + private State state; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + + /** + * Creates a TaperedNgramTokenFilter that, for a given input term, produces all contained n-grams with length = maxGram. Will generate + * small ngrams from maxGram down to 1 for the end of the input token. + * + * Note: Care must be taken when choosing maxGram; depending on the input token size, this filter potentially produces a huge number of + * unique terms in the index. + * + * @param input + * {@link TokenStream} holding the input to be tokenized + * @param maxGram + * the maximum length of the generated n-grams (apart from those at tail) + */ + public TaperedNgramTokenFilter(TokenStream input, int maxGram) { + super(input); + if (maxGram < 1) { + throw new IllegalArgumentException("maxGram must be greater than zero"); + } + this.maxGram = maxGram; + } + + @Override + public boolean incrementToken() throws IOException { + while (true) { + if (curTermBuffer == null) { + if (!input.incrementToken()) { + return false; + } + state = captureState(); + + curTermLength = termAtt.length(); + curTermCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length()); + curPosIncr += posIncrAtt.getPositionIncrement(); + curPos = -1; + + curTermBuffer = termAtt.buffer().clone(); + curGramSize = Math.min(curTermCodePointCount, maxGram); + } + curPos++; + if ( (curPos + curGramSize) > curTermCodePointCount) { + // Reached near the end of the string. Start tapering token size down to 1 + curGramSize = curTermCodePointCount - curPos; + } + if (curGramSize > 0) { + restoreState(state); + final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos); + final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize); + termAtt.copyBuffer(curTermBuffer, start, end - start); + + posIncrAtt.setPositionIncrement(curPosIncr); + return true; + } + + // Done with this input token, get next token on next iteration. + curTermBuffer = null; + } + } + + @Override + public void reset() throws IOException { + super.reset(); + curTermBuffer = null; + curPosIncr = 0; + } + + @Override + public void end() throws IOException { + super.end(); + posIncrAtt.setPositionIncrement(curPosIncr); + } +} diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java new file mode 100644 index 0000000000000..8fd16f40b83e3 --- /dev/null +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -0,0 +1,581 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + + +package org.elasticsearch.xpack.wildcard.mapper; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.ngram.NGramTokenFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.document.BinaryDocValuesField; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.DocValuesFieldExistsQuery; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.MultiTermQuery.RewriteMethod; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.ElasticsearchParseException; +import org.elasticsearch.common.lucene.BytesRefs; +import org.elasticsearch.common.lucene.Lucene; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.common.xcontent.support.XContentMapValues; +import org.elasticsearch.index.mapper.FieldMapper; +import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.Mapper; +import org.elasticsearch.index.mapper.MapperParsingException; +import org.elasticsearch.index.mapper.ParseContext; +import org.elasticsearch.index.query.QueryShardContext; + +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import static org.elasticsearch.index.mapper.TypeParsers.parseField; + +/** + * A {@link FieldMapper} for indexing a keyword fields with ngrams for efficient wildcard matching + */ +public class WildcardFieldMapper extends FieldMapper { + + public static final String CONTENT_TYPE = "wildcard_keyword"; + public static short MAX_NUM_CHARS_COUNT = 6; //maximum allowed number of characters per ngram + + public static class Defaults { + public static final MappedFieldType FIELD_TYPE = new WildcardFieldType(); + + static { + FIELD_TYPE.setTokenized(true); + FIELD_TYPE.setIndexOptions(IndexOptions.DOCS); + FIELD_TYPE.setStoreTermVectorOffsets(false); + FIELD_TYPE.setOmitNorms(true); + FIELD_TYPE.freeze(); + } + public static final int IGNORE_ABOVE = Integer.MAX_VALUE; + public static final String MATCH_TYPE_DOC_VALUES = "doc_values"; + public static final String MATCH_TYPE_BINARY_DOC_VALUES = "binary_doc_values"; + public static final String MATCH_TYPE_POSITION = "positions"; + + } + + public static class Builder extends FieldMapper.Builder { + private int numChars = 3; + protected int ignoreAbove = Defaults.IGNORE_ABOVE; + protected String matchType = Defaults.MATCH_TYPE_BINARY_DOC_VALUES; + + + public Builder(String name) { + super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE); + builder = this; + } + + @Override + public Builder docValues(boolean docValues) { + if(docValues == false) { + throw new MapperParsingException("The field [" + name + + "] cannot have doc values = false"); + } + return this; + } + + @Override + public Builder index(boolean index) { + if(index == false) { + throw new MapperParsingException("The field [" + name + + "] cannot have index = false"); + } + return this; + } + + public Builder numChars(int numChars) { + if ((numChars > MAX_NUM_CHARS_COUNT) || (numChars < 1)) { + throw new MapperParsingException("The number of characters for ngrams in field [" + name + + "] should be in the range [1, " + MAX_NUM_CHARS_COUNT + "]"); + } + this.numChars = numChars; + return this; + } + + public Builder ignoreAbove(int ignoreAbove) { + if (ignoreAbove < 0) { + throw new IllegalArgumentException("[ignore_above] must be positive, got " + ignoreAbove); + } + this.ignoreAbove = ignoreAbove; + return this; + } + + public Builder matchType(String matchType) { + if (Defaults.MATCH_TYPE_DOC_VALUES.equals(matchType) == false && + Defaults.MATCH_TYPE_BINARY_DOC_VALUES.equals(matchType) == false && + Defaults.MATCH_TYPE_POSITION.equals(matchType) == false) { + throw new IllegalArgumentException("[match_type] must be " + Defaults.MATCH_TYPE_DOC_VALUES + " or " + + Defaults.MATCH_TYPE_BINARY_DOC_VALUES + " or " + + Defaults.MATCH_TYPE_POSITION + ", got " + matchType); + } + this.matchType = matchType; + return this; + } + + @Override + protected void setupFieldType(BuilderContext context) { + super.setupFieldType(context); + fieldType().setNumChars(numChars); + fieldType().setMatchType(matchType); + if (matchType.equals(Defaults.MATCH_TYPE_POSITION)) { + fieldType().setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); + } else { + fieldType().setIndexOptions(IndexOptions.DOCS); + } + } + + @Override + public WildcardFieldType fieldType() { + return (WildcardFieldType) super.fieldType(); + } + + @Override + public WildcardFieldMapper build(BuilderContext context) { + setupFieldType(context); + return new WildcardFieldMapper( + name, fieldType, defaultFieldType, ignoreAbove, matchType, numChars, + context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo); + } + + @Override + public Builder indexOptions(IndexOptions indexOptions) { + // Suspected parse sequencing problem here - if match_type not set yet we don't know + // if this is appropriate or not.... +// if (matchType.equals(Defaults.MATCH_TYPE_DOC_VALUES)) { +// if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) > 0) { +// throw new IllegalArgumentException("The [wildcard] field does not support positions with match_type " +// + Defaults.MATCH_TYPE_DOC_VALUES+", got [index_options]=" +// + indexOptionToString(indexOptions)); +// } +// } + return super.indexOptions(indexOptions); + } + + } + + public static class TypeParser implements Mapper.TypeParser { + @Override + public Mapper.Builder parse(String name, Map node, ParserContext parserContext) + throws MapperParsingException { + WildcardFieldMapper.Builder builder = new WildcardFieldMapper.Builder(name); + parseField(builder, name, node, parserContext); + + for (Iterator> iterator = node.entrySet().iterator(); iterator.hasNext();) { + Map.Entry entry = iterator.next(); + String propName = entry.getKey(); + Object propNode = entry.getValue(); + if (propName.equals("num_chars")) { + if (propNode == null) { + throw new MapperParsingException("Property [numChars] cannot be null."); + } + builder.numChars(XContentMapValues.nodeIntegerValue(propNode)); + iterator.remove(); + } else if (propName.equals("ignore_above")) { + builder.ignoreAbove(XContentMapValues.nodeIntegerValue(propNode, -1)); + iterator.remove(); + } else if (propName.equals("match_type")) { + builder.matchType(XContentMapValues.nodeStringValue(propNode, Defaults.MATCH_TYPE_BINARY_DOC_VALUES)); + iterator.remove(); + } + } + + return builder; + } + } + + public static final char TOKEN_START_OR_END_CHAR = 0; + // A visible character to aid debug +// public static final char TOKEN_START_OR_END_CHAR = '$'; + + + public static final class WildcardFieldType extends MappedFieldType { + private int numChars; + private String matchType; + + public WildcardFieldType() { + setIndexAnalyzer(Lucene.KEYWORD_ANALYZER); + setSearchAnalyzer(Lucene.KEYWORD_ANALYZER); + } + + protected WildcardFieldType(WildcardFieldType ref) { + super(ref); + } + + public WildcardFieldType clone() { + return new WildcardFieldType(this); + } + + + // Holds parsed information about the wildcard pattern + static class PatternStructure { + boolean openStart, openEnd, hasSymbols; + int lastGap =0; + int wildcardCharCount, wildcardStringCount; + String[] fragments; + Integer [] precedingGapSizes; + final String pattern; + + @SuppressWarnings("fallthrough") // Intentionally uses fallthrough mirroring implementation in Lucene's WildcardQuery + PatternStructure (String wildcardText) { + this.pattern = wildcardText; + ArrayList fragmentList = new ArrayList<>(); + ArrayList precedingGapSizeList = new ArrayList<>(); + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < wildcardText.length();) { + final int c = wildcardText.codePointAt(i); + int length = Character.charCount(c); + switch (c) { + case WildcardQuery.WILDCARD_STRING: + if (i == 0) { + openStart = true; + } + openEnd = true; + hasSymbols = true; + wildcardStringCount++; + + if (sb.length() > 0) { + precedingGapSizeList.add(lastGap); + fragmentList.add(sb.toString()); + sb = new StringBuilder(); + } + lastGap = Integer.MAX_VALUE; + break; + case WildcardQuery.WILDCARD_CHAR: + if (i == 0) { + openStart = true; + } + hasSymbols = true; + wildcardCharCount++; + openEnd = true; + if (sb.length() > 0) { + precedingGapSizeList.add(lastGap); + fragmentList.add(sb.toString()); + sb = new StringBuilder(); + lastGap = 0; + } + + if (lastGap != Integer.MAX_VALUE) { + lastGap++; + } + break; + case WildcardQuery.WILDCARD_ESCAPE: + // add the next codepoint instead, if it exists + if (i + length < wildcardText.length()) { + final int nextChar = wildcardText.codePointAt(i + length); + length += Character.charCount(nextChar); + sb.append(Character.toChars(nextChar)); + openEnd = false; + break; + } // else fallthru, lenient parsing with a trailing \ + default: + openEnd = false; + sb.append(Character.toChars(c)); + } + i += length; + } + if (sb.length() > 0) { + precedingGapSizeList.add(lastGap); + fragmentList.add(sb.toString()); + lastGap = 0; + } + fragments = fragmentList.toArray(new String[0]); + precedingGapSizes = precedingGapSizeList.toArray(new Integer[0]); + + } + + public boolean needsVerification() { + // Return true if term queries are not enough evidence + if (fragments.length == 1 && wildcardCharCount == 0) { + // The one case where we don't need verification is when + // we have a single fragment and no ? characters + return false; + } + return true; + } + + // Returns number of positions for last gap (Integer.MAX means unlimited gap) + public int getPrecedingGapSize(int fragmentNum) { + return precedingGapSizes[fragmentNum]; + } + + public boolean isMatchAll() { + return fragments.length == 0 && wildcardStringCount >0 && wildcardCharCount ==0; + } + + @Override + public int hashCode() { + return pattern.hashCode(); + } + + @Override + public boolean equals(Object obj) { + PatternStructure other = (PatternStructure) obj; + return pattern.equals(other.pattern); + } + + + } + + + @Override + public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QueryShardContext context) { + PatternStructure patternStructure = new PatternStructure(wildcardPattern); + + + if (matchType.equals(Defaults.MATCH_TYPE_POSITION)) { + return new WildcardPositionBasedQuery(name(), patternStructure, numChars); + } + + + BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder(); + + for (int i = 0; i < patternStructure.fragments.length; i++) { + String fragment = patternStructure.fragments[i]; + int fLength = fragment.length(); + if (fLength == 0) { + continue; + } + + // Add any start/end of string character + if (i == 0 && patternStructure.openStart == false) { + // Start-of-string anchored (is not a leading wildcard) + fragment = TOKEN_START_OR_END_CHAR + fragment; + } + if (patternStructure.openEnd == false && i == patternStructure.fragments.length - 1) { + // End-of-string anchored (is not a trailing wildcard) + fragment = fragment + TOKEN_START_OR_END_CHAR; + } + + if (fragment.length() == numChars) { + TermQuery tq = new TermQuery(new Term(name(), fragment)); + bqBuilder.add(new BooleanClause(tq, Occur.MUST)); + } else if (fragment.length() > numChars) { + // Break fragment into multiple Ngrams + KeywordTokenizer kt = new KeywordTokenizer(256); + kt.setReader(new StringReader(fragment)); + TokenFilter filter = new NGramTokenFilter(kt, numChars, numChars, false); + CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class); + String lastUnusedToken = null; + try { + filter.reset(); + int nextRequiredCoverage = 0; + int charPos = 0; + // minimise number of terms searched - eg for "1234567" and 4grams we only need terms + // `1234` and `4567` - no need to search for 2345 and 3456 + while (filter.incrementToken()) { + if (charPos == nextRequiredCoverage) { + TermQuery tq = new TermQuery(new Term(name(), termAtt.toString())); + bqBuilder.add(new BooleanClause(tq, Occur.MUST)); + nextRequiredCoverage = charPos + termAtt.length() - 1; + } else { + lastUnusedToken = termAtt.toString(); + } + charPos++; + } + if (lastUnusedToken != null) { + // given `cake` and 3 grams the loop above would output only `cak` and we need to add trailing + // `ake` to complete the logic. + TermQuery tq = new TermQuery(new Term(name(), lastUnusedToken)); + bqBuilder.add(new BooleanClause(tq, Occur.MUST)); + } + kt.end(); + kt.close(); + } catch(IOException ioe) { + throw new ElasticsearchParseException("Error parsing wildcard query pattern fragment ["+fragment+"]"); + } + } else { + // fragment is smaller than smallest ngram size + if (patternStructure.openEnd || i < patternStructure.fragments.length - 1) { + // fragment occurs mid-string so will need a wildcard query + WildcardQuery wq = new WildcardQuery(new Term(name(),fragment+"*")); + wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE); + bqBuilder.add(new BooleanClause(wq, Occur.MUST)); + } else { + // fragment occurs at end of string so can rely on Jim's indexing rule to optimise + // *foo by indexing smaller ngrams at the end of a string + TermQuery tq = new TermQuery(new Term(name(), fragment)); + bqBuilder.add(new BooleanClause(tq, Occur.MUST)); + } + } + } + + BooleanQuery approximation = bqBuilder.build(); + if (patternStructure.isMatchAll()) { + return new MatchAllDocsQuery(); + } + if (approximation.clauses().size() > 1 || patternStructure.needsVerification()) { + BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder(); + verifyingBuilder.add(new BooleanClause(approximation, Occur.MUST)); + if (matchType.equals(Defaults.MATCH_TYPE_DOC_VALUES)) { + verifyingBuilder.add(new BooleanClause(new WildcardOnDvQuery(name(), wildcardPattern), Occur.MUST)); + } else { + assert matchType.equals(Defaults.MATCH_TYPE_BINARY_DOC_VALUES); + verifyingBuilder.add(new BooleanClause(new WildcardOnBinaryDvQuery(name(), wildcardPattern), Occur.MUST)); + } + return verifyingBuilder.build(); + } + return approximation; + } + + int numChars() { + return numChars; + } + + void setNumChars(int numChars) { + this.numChars = numChars; + } + + void setMatchType(String matchType) { + this.matchType = matchType; + } + + @Override + public String typeName() { + return CONTENT_TYPE; + } + + @Override + public Query existsQuery(QueryShardContext context) { + return new DocValuesFieldExistsQuery(name()); + } + + @Override + public Query termQuery(Object value, QueryShardContext context) { + return wildcardQuery(BytesRefs.toString(value), MultiTermQuery.CONSTANT_SCORE_REWRITE, context); + } + + @Override + public Query termsQuery(List values, QueryShardContext context) { + BooleanQuery.Builder bq = new BooleanQuery.Builder(); + for (Object value : values) { + bq.add(termQuery(value, context), Occur.SHOULD); + } + return new ConstantScoreQuery(bq.build()); + } + } + + private int ignoreAbove; + private int numChars; + private String matchType; + + private WildcardFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType, + int ignoreAbove, String matchType, int numChars, Settings indexSettings, MultiFields multiFields, CopyTo copyTo) { + super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo); + this.ignoreAbove = ignoreAbove; + this.matchType = matchType; + this.numChars = numChars; + if (matchType.equals(Defaults.MATCH_TYPE_POSITION)) { + assert fieldType.indexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; + + } else { + assert fieldType.indexOptions() == IndexOptions.DOCS; + + } + } + + + /** Values that have more chars than the return value of this method will + * be skipped at parsing time. */ + // pkg-private for testing + int ignoreAbove() { + return ignoreAbove; + } + + @Override + protected WildcardFieldMapper clone() { + return (WildcardFieldMapper) super.clone(); + } + + @Override + public WildcardFieldType fieldType() { + return (WildcardFieldType) super.fieldType(); + } + + @Override + protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, Params params) throws IOException { + super.doXContentBody(builder, includeDefaults, params); + builder.field("num_chars", fieldType().numChars()); + if (includeDefaults || ignoreAbove != Defaults.IGNORE_ABOVE) { + builder.field("ignore_above", ignoreAbove); + } + if (includeDefaults || matchType != Defaults.MATCH_TYPE_BINARY_DOC_VALUES) { + builder.field("match_type", matchType); + } + } + + @Override + protected void parseCreateField(ParseContext context, List fields) throws IOException { + final String value; + if (context.externalValueSet()) { + value = context.externalValue().toString(); + } else { + XContentParser parser = context.parser(); + if (parser.currentToken() == XContentParser.Token.VALUE_NULL) { + value = fieldType().nullValueAsString(); + } else { + value = parser.textOrNull(); + } + } + + if (value == null || value.length() > ignoreAbove) { + return; + } + + createFields(value, fields); + } + + void createFields(String value, Listfields) { + KeywordTokenizer kt = new KeywordTokenizer(256); + kt.setReader(new StringReader(TOKEN_START_OR_END_CHAR+ value +TOKEN_START_OR_END_CHAR)); + TokenFilter filter = new TaperedNgramTokenFilter(kt, fieldType().numChars); + + Field field = new Field(fieldType().name(), filter, fieldType()); + fields.add(field); + + if (matchType.equals(Defaults.MATCH_TYPE_DOC_VALUES)) { + Field dvField = new SortedSetDocValuesField(fieldType().name(), new BytesRef(value)); + fields.add(dvField); + } + if (matchType.equals(Defaults.MATCH_TYPE_BINARY_DOC_VALUES)) { + Field dvField = new BinaryDocValuesField(fieldType().name(), new BytesRef(value)); + fields.add(dvField); + } + } + + @Override + protected String contentType() { + return CONTENT_TYPE; + } + + + @Override + protected void doMerge(Mapper mergeWith) { + super.doMerge(mergeWith); + this.ignoreAbove = ((WildcardFieldMapper) mergeWith).ignoreAbove; + this.matchType = ((WildcardFieldMapper) mergeWith).matchType; + this.numChars = ((WildcardFieldMapper) mergeWith).numChars; + } +} diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardOnBinaryDvQuery.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardOnBinaryDvQuery.java new file mode 100644 index 0000000000000..f058f8b0fada9 --- /dev/null +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardOnBinaryDvQuery.java @@ -0,0 +1,95 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +package org.elasticsearch.xpack.wildcard.mapper; + +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.ConstantScoreScorer; +import org.apache.lucene.search.ConstantScoreWeight; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.TwoPhaseIterator; +import org.apache.lucene.search.Weight; +import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.ByteRunAutomaton; + +import java.io.IOException; +import java.util.Objects; + +/** + * Query that runs a wildcard pattern across all binary doc values. + * Expensive to run so normally used in conjunction with more selective query clauses. + */ +public class WildcardOnBinaryDvQuery extends Query { + + private final String field; + private final String wildcardPattern; + + public WildcardOnBinaryDvQuery(String field, String wildcardPattern) { + this.field = field; + this.wildcardPattern = wildcardPattern; + } + + @Override + public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { + + Automaton automaton = WildcardQuery.toAutomaton(new Term(field,wildcardPattern)); + ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton); + + return new ConstantScoreWeight(this, boost) { + + @Override + public Scorer scorer(LeafReaderContext context) throws IOException { + final BinaryDocValues values = DocValues.getBinary(context.reader(), field); + TwoPhaseIterator twoPhase = new TwoPhaseIterator(values) { + @Override + public boolean matches() throws IOException { + if (values.advanceExact(approximation.docID())) { + BytesRef value = values.binaryValue(); + return bytesMatcher.run(value.bytes, value.offset, value.length); + } + return false; + } + + @Override + public float matchCost() { + // TODO: how can we compute this? + return 1000f; + } + }; + return new ConstantScoreScorer(this, score(), scoreMode, twoPhase); + } + + @Override + public boolean isCacheable(LeafReaderContext ctx) { + return true; + } + }; + } + @Override + public String toString(String field) { + return field+":"+wildcardPattern; + } + + @Override + public boolean equals(Object obj) { + WildcardOnBinaryDvQuery other = (WildcardOnBinaryDvQuery) obj; + return Objects.equals(field, other.field) && Objects.equals(wildcardPattern, other.wildcardPattern); + } + + @Override + public int hashCode() { + return Objects.hash(field, wildcardPattern); + } + +} diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardOnDvQuery.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardOnDvQuery.java new file mode 100644 index 0000000000000..596f2e962b7be --- /dev/null +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardOnDvQuery.java @@ -0,0 +1,99 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +package org.elasticsearch.xpack.wildcard.mapper; + +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.ConstantScoreScorer; +import org.apache.lucene.search.ConstantScoreWeight; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.TwoPhaseIterator; +import org.apache.lucene.search.Weight; +import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.ByteRunAutomaton; + +import java.io.IOException; +import java.util.Objects; + +/** + * Query that runs a wildcard pattern across all doc values. + * Expensive to run so normally used in conjunction with more selective query clauses. + */ +public class WildcardOnDvQuery extends Query { + + private final String field; + private final String wildcardPattern; + + public WildcardOnDvQuery(String field, String wildcardPattern) { + this.field = field; + this.wildcardPattern = wildcardPattern; + } + + @Override + public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { + + Automaton automaton = WildcardQuery.toAutomaton(new Term(field,wildcardPattern)); + ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton); + + return new ConstantScoreWeight(this, boost) { + + @Override + public Scorer scorer(LeafReaderContext context) throws IOException { + SortedSetDocValues values = DocValues.getSortedSet(context.reader(), field); + TwoPhaseIterator twoPhase = new TwoPhaseIterator(values) { + @Override + public boolean matches() throws IOException { + long ord = values.nextOrd(); + while (ord != SortedSetDocValues.NO_MORE_ORDS) { + BytesRef value = values.lookupOrd(ord); + if (bytesMatcher.run(value.bytes, value.offset, value.length)) { + return true; + } + ord = values.nextOrd(); + } + return false; + } + + @Override + public float matchCost() { + // TODO: how can we compute this? + return 1000f; + } + }; + return new ConstantScoreScorer(this, score(), scoreMode, twoPhase); + } + + @Override + public boolean isCacheable(LeafReaderContext ctx) { + return true; + } + }; + } + @Override + public String toString(String field) { + return field+":"+wildcardPattern; + } + + @Override + public boolean equals(Object obj) { + WildcardOnDvQuery other = (WildcardOnDvQuery) obj; + return Objects.equals(field, other.field) && Objects.equals(wildcardPattern, other.wildcardPattern); + } + + @Override + public int hashCode() { + return Objects.hash(field, wildcardPattern); + } + +} diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardPositionBasedQuery.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardPositionBasedQuery.java new file mode 100644 index 0000000000000..8dd2032726550 --- /dev/null +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardPositionBasedQuery.java @@ -0,0 +1,203 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +package org.elasticsearch.xpack.wildcard.mapper; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.ngram.NGramTokenFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.queries.XIntervals; +import org.apache.lucene.queries.intervals.IntervalQuery; +import org.apache.lucene.queries.intervals.Intervals; +import org.apache.lucene.queries.intervals.IntervalsSource; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.ElasticsearchParseException; +import org.elasticsearch.xpack.wildcard.mapper.WildcardFieldMapper.WildcardFieldType.PatternStructure; + +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Objects; + +public class WildcardPositionBasedQuery extends Query{ + + private final String field; + private final PatternStructure wildcardPattern; + private final int numChars; + + public WildcardPositionBasedQuery(String field, PatternStructure wildcardPattern, int numChars) { + this.field = field; + this.wildcardPattern = wildcardPattern; + this.numChars = numChars; + } + + + private void addFragment(ArrayList spans, int fragmentNum, IntervalsSource newFragment) { + // TODO mixtures of ? and * in a sequence e.g. aa?* are problematic because we only have the max gap + //(Integer.maxInt in this case) and not the min (which is 1 in this example). + int precedingGapSize = wildcardPattern.getPrecedingGapSize(fragmentNum); + if ( precedingGapSize >0 && precedingGapSize < Integer.MAX_VALUE) { + if (spans.size() == 0) { + IntervalsSource wildcard = XIntervals.wildcard(new BytesRef(WildcardFieldMapper.TOKEN_START_OR_END_CHAR + "*")); + IntervalsSource addedGap = Intervals.extend(wildcard, 0, precedingGapSize); + IntervalsSource phrase = Intervals.phrase(addedGap, newFragment); + spans.add(phrase); + return; + } + IntervalsSource lastFragment = spans.get(spans.size()-1); + + IntervalsSource addedGap = Intervals.extend(lastFragment, 0, precedingGapSize); + IntervalsSource phrase = Intervals.phrase(addedGap, newFragment); + spans.set(spans.size()-1, phrase); + + + } else { + spans.add(newFragment); + } + } + + + + + private String escapeWildcards(String s) { + s = s.replace(Character.toString(WildcardQuery.WILDCARD_CHAR), + Character.toString(WildcardQuery.WILDCARD_ESCAPE)+Character.toString(WildcardQuery.WILDCARD_CHAR)); + s = s.replace(Character.toString(WildcardQuery.WILDCARD_STRING), + Character.toString(WildcardQuery.WILDCARD_ESCAPE)+Character.toString(WildcardQuery.WILDCARD_STRING)); + return s; + } + + @Override + public Query rewrite(IndexReader reader) throws IOException { + + + ArrayList spans = new ArrayList<>(); + + + for (int i = 0; i < wildcardPattern.fragments.length; i++) { + String fragment = wildcardPattern.fragments[i]; + int fLength = fragment.length(); + if (fLength == 0) { + continue; + } + + // Add any start/end of string character + if (i == 0 && wildcardPattern.openStart == false) { + // Start-of-string anchored (is not a leading wildcard) + fragment = WildcardFieldMapper.TOKEN_START_OR_END_CHAR + fragment; + } + if (wildcardPattern.openEnd == false && i == wildcardPattern.fragments.length - 1) { + // End-of-string anchored (is not a trailing wildcard) + fragment = fragment + WildcardFieldMapper.TOKEN_START_OR_END_CHAR; + } + + if (fragment.length() == numChars) { + IntervalsSource addedGap = Intervals.extend(Intervals.term(fragment), 0, fragment.length()-1); + addFragment(spans, i, addedGap); + } else if (fragment.length() > numChars) { + // Break fragment into multiple Ngrams + + + ArrayList fragmentRun = new ArrayList<>(); + KeywordTokenizer kt = new KeywordTokenizer(256); + kt.setReader(new StringReader(fragment)); + TokenFilter filter = new NGramTokenFilter(kt, numChars, numChars, false); + CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class); + try { + filter.reset(); + int nextRequiredCoverage = 0; + int charPos = 0; + int endOfOptimisableSection = fragment.length() - (numChars * 2); + + while (filter.incrementToken()) { + if (charPos < endOfOptimisableSection) { + if (charPos == nextRequiredCoverage) { + IntervalsSource iTerm = Intervals.term(termAtt.toString()); + fragmentRun.add(Intervals.extend(iTerm, 0, termAtt.length() - 1)); + // optimise - skip unnecessary overlapping tokens + nextRequiredCoverage = charPos + termAtt.length(); + } + } else { + // We are into the tail of the string that can't be optimised by skipping + if (charPos >= nextRequiredCoverage) { + fragmentRun.add(Intervals.term(termAtt.toString())); + if (charPos + termAtt.length() >= fragment.length()) { + // we've achieved full coverage of the pattern now + break; + } + } + } + charPos++; + } + kt.end(); + kt.close(); + } catch(IOException ioe) { + throw new ElasticsearchParseException("Error parsing wildcard query pattern fragment ["+fragment+"]"); + } + + IntervalsSource phrase = Intervals.phrase(fragmentRun.toArray(new IntervalsSource[0])); + IntervalsSource addedGap = Intervals.extend(phrase, 0, numChars - 1); + addFragment(spans, i, addedGap); + + } else { + // fragment is smaller than smallest ngram size + if (wildcardPattern.openEnd || i < wildcardPattern.fragments.length - 1) { + // fragment occurs mid-string so will need a wildcard query + IntervalsSource wildcard = XIntervals.wildcard(new BytesRef(escapeWildcards(fragment) + "*")); + + IntervalsSource addedGap = Intervals.extend(wildcard, 0, fragment.length()-1); + addFragment(spans, i, addedGap); + } else { + // fragment occurs at end of string so can rely on Jim's indexing rule to optimise + // *foo by indexing smaller ngrams at the end of a string + IntervalsSource addedGap = Intervals.extend(Intervals.term(fragment), 0, fragment.length()-1); + addFragment(spans, i, addedGap); + } + } + } + + if (wildcardPattern.lastGap > 0 && wildcardPattern.lastGap < Integer.MAX_VALUE) { + IntervalsSource lastFragment = spans.get(spans.size() - 1); + IntervalsSource addedGap = Intervals.extend(lastFragment, 0, wildcardPattern.lastGap); + IntervalsSource fieldEnd = Intervals.term(new BytesRef("" + WildcardFieldMapper.TOKEN_START_OR_END_CHAR)); + IntervalsSource phrase = Intervals.phrase(addedGap, fieldEnd); + spans.set(spans.size() - 1, phrase); + } + + if (spans.size()==1) { + IntervalQuery iq = new IntervalQuery(field, spans.get(0)); + return iq; + } + + IntervalQuery iq = new IntervalQuery(field, Intervals.ordered(spans.toArray(new IntervalsSource[0]))); + return iq; + } + + @Override + public boolean equals(Object obj) { + WildcardPositionBasedQuery other = (WildcardPositionBasedQuery) obj; + return Objects.equals(field, other.field) && Objects.equals(wildcardPattern, other.wildcardPattern) && + Objects.equals(numChars, other.numChars); + } + + @Override + public int hashCode() { + return Objects.hash(field, wildcardPattern, numChars); + } + + + + + @Override + public String toString(String field) { + return field + ":" + wildcardPattern.pattern; + } + +} diff --git a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/WildcardFieldTests.java b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/WildcardFieldTests.java new file mode 100644 index 0000000000000..8a81fca7a6b7e --- /dev/null +++ b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/WildcardFieldTests.java @@ -0,0 +1,233 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +package org.elasticsearch.xpack.wildcard; + + +import org.apache.lucene.search.WildcardQuery; +import org.elasticsearch.action.index.IndexRequestBuilder; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.index.query.WildcardQueryBuilder; +import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.test.ESIntegTestCase; +import org.elasticsearch.xpack.core.LocalStateCompositeXPackPlugin; +import org.elasticsearch.xpack.wildcard.mapper.WildcardFieldMapper; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchResponse; +import static org.hamcrest.Matchers.equalTo; + +@ESIntegTestCase.SuiteScopeTestCase +public class WildcardFieldTests extends ESIntegTestCase { + + static final int MAX_FIELD_LENGTH = 100; + static final String TOO_BIG_PREFIX = "toobig"; + static String randomABString(int minLength) { + StringBuilder sb = new StringBuilder(); + while (sb.length() < minLength) { + if (randomBoolean()) { + sb.append("a"); + } else { + sb.append("b"); + } + } + return sb.toString(); + } + + @Override + public void setupSuiteScopeCluster() throws Exception { + + assertAcked(prepareCreate("idx").setMapping( buildMapping()).get()); + + List builders = new ArrayList<>(); + int numDocs = 100; + for (int i = 0; i < numDocs; i++) { + // TODO An issue with array-based matching ... https://github.com/elastic/elasticsearch/pull/49993#issuecomment-567531755 +// if(i%10==0) { + if(false) { + // One in ten docs use an array, not a single value + String data1 = randomABString(1+randomInt(MAX_FIELD_LENGTH)); + String data2 = randomABString(1+randomInt(MAX_FIELD_LENGTH)); + XContentBuilder source = jsonBuilder().startObject().array("data", data1, data2) + .endObject(); + builders.add(client().prepareIndex("idx").setId("" + i).setSource(source)); + } else { + String data = randomABString(1+randomInt(MAX_FIELD_LENGTH)); + XContentBuilder source = jsonBuilder().startObject().field("data", data).endObject(); + builders.add(client().prepareIndex("idx").setId("" + i).setSource(source)); + } + } + // Add a doc to test ignore_above + String data = TOO_BIG_PREFIX + randomABString(MAX_FIELD_LENGTH); + XContentBuilder source = jsonBuilder().startObject().field("data", data).endObject(); + builders.add(client().prepareIndex("idx").setSource(source)); + + indexRandom(true, builders); + ensureSearchable(); + } + + public static Map buildMapping() { + Map fields = new HashMap<>(); + + Map rootFieldDef = new HashMap<>(); + fields.put("data", rootFieldDef); + { + rootFieldDef.put("type", WildcardFieldMapper.CONTENT_TYPE); + rootFieldDef.put("ignore_above", MAX_FIELD_LENGTH); + Map subFields = new HashMap<>(); + rootFieldDef.put("fields", subFields); + { + Map subFieldDef1 = new HashMap<>(); + subFields.put("asKeyword", subFieldDef1); + { + subFieldDef1.put("type", "keyword"); + subFieldDef1.put("ignore_above", MAX_FIELD_LENGTH); + } + Map subFieldDef2 = new HashMap<>(); + subFields.put("asPosWildcard", subFieldDef2); + { + subFieldDef2.put("type", WildcardFieldMapper.CONTENT_TYPE); + subFieldDef2.put("ignore_above", MAX_FIELD_LENGTH); + subFieldDef2.put("match_type", WildcardFieldMapper.Defaults.MATCH_TYPE_POSITION); + } + Map subFieldDef3 = new HashMap<>(); + subFields.put("asBinaryDV", subFieldDef3); + { + subFieldDef3.put("type", WildcardFieldMapper.CONTENT_TYPE); + subFieldDef3.put("ignore_above", MAX_FIELD_LENGTH); + subFieldDef3.put("match_type", WildcardFieldMapper.Defaults.MATCH_TYPE_BINARY_DOC_VALUES); + } + } + } + return Collections.singletonMap("properties", fields); + } + + @Override + protected Collection> nodePlugins() { + List> plugins = new ArrayList<>(); + plugins.add(Wildcard.class); + plugins.add(LocalStateCompositeXPackPlugin.class); + return plugins; + } + + + public void testKeywordAndWildcardSearchesConcur() throws Exception { + + int numSearches = 100; + for (int i = 0; i < numSearches; i++) { + String randomWildcardPattern = getRandomWildcardPattern(); + SearchResponse wildcardResponse = client().prepareSearch("idx") + .setQuery(new WildcardQueryBuilder("data", randomWildcardPattern)) + .get(); + + assertSearchResponse(wildcardResponse); + + SearchResponse keywordResponse = client().prepareSearch("idx") + .setQuery(new WildcardQueryBuilder("data.asKeyword", randomWildcardPattern)) + .get(); + assertSearchResponse(keywordResponse); + assertThat(wildcardResponse.getHits().getTotalHits().value, equalTo(keywordResponse.getHits().getTotalHits().value)); + + } + } + + public void testKeywordAndBInaryDvWildcardSearchesConcur() throws Exception { + + int numSearches = 100; + for (int i = 0; i < numSearches; i++) { + String randomWildcardPattern = getRandomWildcardPattern(); + SearchResponse wildcardResponse = client().prepareSearch("idx") + .setQuery(new WildcardQueryBuilder("data.asBinaryDV", randomWildcardPattern)) + .get(); + + assertSearchResponse(wildcardResponse); + + SearchResponse keywordResponse = client().prepareSearch("idx") + .setQuery(new WildcardQueryBuilder("data.asKeyword", randomWildcardPattern)) + .get(); + assertSearchResponse(keywordResponse); + assertThat(wildcardResponse.getHits().getTotalHits().value, equalTo(keywordResponse.getHits().getTotalHits().value)); + + } + } + + + public void testWildcardPosAndKeywordSearchesConcur() throws Exception { + + int numSearches = 100; + for (int i = 0; i < numSearches; i++) { + String randomWildcardPattern = getRandomWildcardPattern(); + SearchResponse wildcardResponse = client().prepareSearch("idx") + .setQuery(new WildcardQueryBuilder("data.asPosWildcard", randomWildcardPattern)) + .get(); + + assertSearchResponse(wildcardResponse); + + SearchResponse keywordResponse = client().prepareSearch("idx") + .setQuery(new WildcardQueryBuilder("data.asKeyword", randomWildcardPattern)) + .get(); + assertSearchResponse(keywordResponse); + assertThat(wildcardResponse.getHits().getTotalHits().value, equalTo(keywordResponse.getHits().getTotalHits().value)); + + } + } + + public void testIgnoreAbove() throws Exception { + SearchResponse wildcardResponse = client().prepareSearch("idx").setQuery(new WildcardQueryBuilder("data", TOO_BIG_PREFIX + "*")) + .get(); + assertSearchResponse(wildcardResponse); + assertThat(wildcardResponse.getHits().getTotalHits().value, equalTo(0L)); + } + + private void randomSyntaxChar(StringBuilder sb) { + switch (randomInt(3)) { + case 0: + sb.append(WildcardQuery.WILDCARD_CHAR); + break; + case 1: + sb.append(WildcardQuery.WILDCARD_STRING); + break; + case 2: + sb.append(WildcardQuery.WILDCARD_ESCAPE); + sb.append(WildcardQuery.WILDCARD_STRING); + break; + case 3: + sb.append(WildcardQuery.WILDCARD_ESCAPE); + sb.append(WildcardQuery.WILDCARD_CHAR); + break; + } + } + + private String getRandomWildcardPattern() { + StringBuilder sb = new StringBuilder(); + + int numFragments = 1+randomInt(4); + + if(randomInt(10)==1) { + randomSyntaxChar(sb); + } + for (int i = 0; i < numFragments; i++) { + if(i>0) { + randomSyntaxChar(sb); + } + sb.append(randomABString(1+randomInt(6))); + } + if(randomInt(10)==1) { + randomSyntaxChar(sb); + } + return sb.toString(); + } + +} diff --git a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/WildcardInfoTransportActionTests.java b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/WildcardInfoTransportActionTests.java new file mode 100644 index 0000000000000..791afeb57c090 --- /dev/null +++ b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/WildcardInfoTransportActionTests.java @@ -0,0 +1,53 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ +package org.elasticsearch.xpack.wildcard; + +import org.elasticsearch.action.support.ActionFilters; +import org.elasticsearch.action.support.PlainActionFuture; +import org.elasticsearch.common.io.stream.BytesStreamOutput; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.license.XPackLicenseState; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.transport.TransportService; +import org.elasticsearch.xpack.core.XPackFeatureSet; +import org.elasticsearch.xpack.core.action.XPackUsageFeatureResponse; +import org.elasticsearch.xpack.core.wildcard.WildcardFeatureSetUsage; +import org.junit.Before; + +import static org.hamcrest.core.Is.is; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class WildcardInfoTransportActionTests extends ESTestCase { + + private XPackLicenseState licenseState; + + @Before + public void init() { + licenseState = mock(XPackLicenseState.class); + } + + public void testAvailable() throws Exception { + WildcardInfoTransportAction featureSet = new WildcardInfoTransportAction( + mock(TransportService.class), mock(ActionFilters.class), Settings.EMPTY, licenseState); + boolean available = randomBoolean(); + when(licenseState.isWildcardAllowed()).thenReturn(available); + assertThat(featureSet.available(), is(available)); + + var usageAction = new WildcardUsageTransportAction(mock(TransportService.class), null, null, + mock(ActionFilters.class), null, Settings.EMPTY, licenseState); + PlainActionFuture future = new PlainActionFuture<>(); + usageAction.masterOperation(null, null, null, future); + XPackFeatureSet.Usage usage = future.get().getUsage(); + assertThat(usage.available(), is(available)); + + BytesStreamOutput out = new BytesStreamOutput(); + usage.writeTo(out); + XPackFeatureSet.Usage serializedUsage = new WildcardFeatureSetUsage(out.bytes().streamInput()); + assertThat(serializedUsage.available(), is(available)); + } + +} diff --git a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenFilterTests.java b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenFilterTests.java new file mode 100644 index 0000000000000..345c5b3aabb72 --- /dev/null +++ b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenFilterTests.java @@ -0,0 +1,77 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + + +package org.elasticsearch.xpack.wildcard.mapper; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.elasticsearch.ElasticsearchParseException; +import org.elasticsearch.test.ESTestCase; + +import java.io.IOException; +import java.io.StringReader; + +import static org.hamcrest.Matchers.equalTo; + +public class TaperedNgramTokenFilterTests extends ESTestCase { + + + public void testLongString() { + checkTokens("Hello world", 6, "Hello ", "ello w", "llo wo", "lo wor", "o worl", " world", "world", "orld", "rld", "ld", "d"); + } + + public void testShortString() { + checkTokens("Hello", 5, "Hello", "ello", "llo", "lo", "o"); + } + + public void testSingleCharDoc() { + checkTokens("H", 5, "H"); + } + + public void testSingleCharNgram() { + checkTokens("Hello", 1, "H", "e", "l", "l", "o"); + } + + public void testFieldMapperEncoding() { + char TOKEN_START_OR_END_CHAR = 0; + checkTokens(TOKEN_START_OR_END_CHAR+"aaa"+TOKEN_START_OR_END_CHAR, 5, + TOKEN_START_OR_END_CHAR+"aaa"+TOKEN_START_OR_END_CHAR, + "aaa"+TOKEN_START_OR_END_CHAR, + "aa"+TOKEN_START_OR_END_CHAR, + "a"+TOKEN_START_OR_END_CHAR, + ""+TOKEN_START_OR_END_CHAR); + } + + public void testTooShortNgram() { + Exception expectedException = expectThrows(IllegalArgumentException.class, () -> checkTokens("Hello", 0, "")); + assertThat(expectedException.getMessage(), equalTo("maxGram must be greater than zero")); + + } + + private static void checkTokens(String value, int ngramLength, String... expectedTokens) { + KeywordTokenizer kt = new KeywordTokenizer(256); + kt.setReader(new StringReader(value)); + TokenFilter filter = new TaperedNgramTokenFilter(kt, ngramLength); + CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class); + int tokPos = 0; + try { + filter.reset(); + while (filter.incrementToken()) { + String expectedToken = expectedTokens[tokPos++]; + String actualToken = termAtt.toString(); + assertEquals(expectedToken, actualToken); + } + kt.end(); + kt.close(); + } catch (IOException ioe) { + throw new ElasticsearchParseException("Error parsing wildcard query pattern fragment [" + value + "]"); + } + assertEquals(expectedTokens.length, tokPos); + } + +} diff --git a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java new file mode 100644 index 0000000000000..95695e1a8a1c6 --- /dev/null +++ b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java @@ -0,0 +1,168 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +package org.elasticsearch.xpack.wildcard.mapper; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.store.Directory; +import org.elasticsearch.Version; +import org.elasticsearch.cluster.metadata.IndexMetaData; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.mapper.ContentPath; +import org.elasticsearch.index.mapper.Mapper; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.xpack.wildcard.mapper.WildcardFieldMapper.Builder; +import org.junit.Before; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; + +import static org.hamcrest.Matchers.equalTo; + +public class WildcardFieldMapperTests extends ESTestCase { + + private static final String KEYWORD_FIELD_NAME = "keyword_field"; + private static final String WILDCARD_FIELD_NAME = "wildcard_field"; + static WildcardFieldMapper wildcardFieldType; + + @Override + @Before + public void setUp() throws Exception { + Builder builder = new WildcardFieldMapper.Builder(WILDCARD_FIELD_NAME); + wildcardFieldType = builder.build(new Mapper.BuilderContext(createIndexSettings().getSettings(), new ContentPath(0))); + super.setUp(); + } + + public void testVersusKeywordField() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(); + iwc.setMergePolicy(newTieredMergePolicy(random())); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + int numDocs = 100; + HashSet values = new HashSet<>(); + for (int i = 0; i < numDocs; i++) { + String docContent = randomABString(1 + randomInt(MAX_FIELD_LENGTH)); + if (values.contains(docContent) == false) { + createDocs(docContent, iw); + values.add(docContent); + } + } + + iw.forceMerge(1); + DirectoryReader reader = iw.getReader(); + IndexSearcher searcher = newSearcher(reader); + iw.close(); + + int numSearches = 100; + for (int i = 0; i < numSearches; i++) { + String randomWildcardPattern = getRandomWildcardPattern(); + + Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(randomWildcardPattern, null, null); + TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, values.size() + 1, Sort.INDEXORDER); + + Query keywordFieldQuery = new WildcardQuery(new Term(KEYWORD_FIELD_NAME, randomWildcardPattern)); + TopDocs kwTopDocs = searcher.search(keywordFieldQuery, values.size() + 1, Sort.INDEXORDER); + + assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(kwTopDocs.totalHits.value)); + + HashSet expectedDocs = new HashSet<>(); + for (ScoreDoc topDoc : kwTopDocs.scoreDocs) { + expectedDocs.add(topDoc.doc); + } + for (ScoreDoc wcTopDoc : wildcardFieldTopDocs.scoreDocs) { + assertTrue(expectedDocs.remove(wcTopDoc.doc)); + } + assertThat(expectedDocs.size(), equalTo(0)); + } + reader.close(); + dir.close(); + } + + private void createDocs(String docContent, RandomIndexWriter iw) throws IOException { + ArrayList fields = new ArrayList<>(); + wildcardFieldType.createFields(docContent, fields); + Document doc = new Document(); + for (IndexableField indexableField : fields) { + doc.add(indexableField); + } + doc.add(new StringField(KEYWORD_FIELD_NAME, docContent, Field.Store.YES)); + iw.addDocument(doc); + } + + protected IndexSettings createIndexSettings() { + return new IndexSettings( + IndexMetaData.builder("_index").settings(Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)) + .numberOfShards(1).numberOfReplicas(0).creationDate(System.currentTimeMillis()).build(), + Settings.EMPTY); + } + + static final int MAX_FIELD_LENGTH = 100; + + static String randomABString(int minLength) { + StringBuilder sb = new StringBuilder(); + while (sb.length() < minLength) { + if (randomBoolean()) { + sb.append("a"); + } else { + sb.append("b"); + } + } + return sb.toString(); + } + + private void randomSyntaxChar(StringBuilder sb) { + switch (randomInt(3)) { + case 0: + sb.append(WildcardQuery.WILDCARD_CHAR); + break; + case 1: + sb.append(WildcardQuery.WILDCARD_STRING); + break; + case 2: + sb.append(WildcardQuery.WILDCARD_ESCAPE); + sb.append(WildcardQuery.WILDCARD_STRING); + break; + case 3: + sb.append(WildcardQuery.WILDCARD_ESCAPE); + sb.append(WildcardQuery.WILDCARD_CHAR); + break; + } + } + + private String getRandomWildcardPattern() { + StringBuilder sb = new StringBuilder(); + int numFragments = 1 + randomInt(4); + if (randomInt(10) == 1) { + randomSyntaxChar(sb); + } + for (int i = 0; i < numFragments; i++) { + if (i > 0) { + randomSyntaxChar(sb); + } + sb.append(randomABString(1 + randomInt(6))); + } + if (randomInt(10) == 1) { + randomSyntaxChar(sb); + } + return sb.toString(); + } +} diff --git a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldTypeTests.java b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldTypeTests.java new file mode 100644 index 0000000000000..ff5c19ed2df6c --- /dev/null +++ b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldTypeTests.java @@ -0,0 +1,19 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + + +package org.elasticsearch.xpack.wildcard.mapper; + +import org.elasticsearch.index.mapper.FieldTypeTestCase; +import org.elasticsearch.index.mapper.MappedFieldType; + +public class WildcardFieldTypeTests extends FieldTypeTestCase { + + @Override + protected MappedFieldType createDefaultFieldType() { + return new WildcardFieldMapper.WildcardFieldType(); + } +} From d15571b394166cf3ccb60a7014b6ee3c2b9597ec Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 24 Feb 2020 15:53:17 +0000 Subject: [PATCH 02/32] Added docs, added illegal argument checking to test --- docs/reference/mapping/types.asciidoc | 4 +- .../reference/mapping/types/wildcard.asciidoc | 53 +++++++++++++++++++ .../mapper/WildcardFieldMapperTests.java | 19 ++++++- 3 files changed, 73 insertions(+), 3 deletions(-) create mode 100644 docs/reference/mapping/types/wildcard.asciidoc diff --git a/docs/reference/mapping/types.asciidoc b/docs/reference/mapping/types.asciidoc index 1b5afad31f0c6..d26d897474deb 100644 --- a/docs/reference/mapping/types.asciidoc +++ b/docs/reference/mapping/types.asciidoc @@ -7,7 +7,7 @@ document: [float] === Core datatypes -string:: <> and <> +string:: <>, <> and <> <>:: `long`, `integer`, `short`, `byte`, `double`, `float`, `half_float`, `scaled_float` <>:: `date` <>:: `date_nanos` @@ -131,3 +131,5 @@ include::types/token-count.asciidoc[] include::types/shape.asciidoc[] include::types/constant-keyword.asciidoc[] + +include::types/wildcard.asciidoc[] diff --git a/docs/reference/mapping/types/wildcard.asciidoc b/docs/reference/mapping/types/wildcard.asciidoc new file mode 100644 index 0000000000000..4d64febb9633a --- /dev/null +++ b/docs/reference/mapping/types/wildcard.asciidoc @@ -0,0 +1,53 @@ +[role="xpack"] +[testenv="basic"] +[[wildcard]] +=== Wildcard datatype +++++ +Wildcard +++++ + +A `wildcard` field stores values optimised for wildcard queries. +Wildcard queries are possible on other field types but suffer from constraints: +* `text` fields limit matching of any wildcard expressions to individual tokens rather than the original whole value held in a field +* `keyword` fields are untokenized but slow at performing wildcard queries (especially patterns with leading wildcards). + +Internally the `wildcard` field indexes the whole field value using ngrams and stores the full string in compressed chunks. +The index is used as a rough filter to cut down the number of values that are then checked by retrieving and checking the full values from the compressed store. +Storage costs are typically lower than those of `keyword` fields + +You index and search a wildcard field as follows + +[source,console] +-------------------------------------------------- +PUT my_index +{ + "mappings": { + "properties": { + "my_wildcard": { + "type": "wildcard" + } + } + } +} + +PUT my_index/_doc/1 +{ + "my_wildcard" : "This string can be quite lengthy" +} + +GET my_index/_doc/_search +{ + "query": { + "wildcard" : "*quite*lengthy" + } +} + + +-------------------------------------------------- + + +==== Limitations + +* `wildcard` fields are untokenized like keyword fields, so do not support queries that rely on word positions such as phrase queries. +* `wildcard` fields cannot be used as a value source in aggregations such as the `terms` aggregation. + diff --git a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java index 95695e1a8a1c6..7f9b9ac3e4c22 100644 --- a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java +++ b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java @@ -27,6 +27,7 @@ import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.mapper.ContentPath; import org.elasticsearch.index.mapper.Mapper; +import org.elasticsearch.index.mapper.MapperParsingException; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.xpack.wildcard.mapper.WildcardFieldMapper.Builder; import org.junit.Before; @@ -50,8 +51,22 @@ public void setUp() throws Exception { wildcardFieldType = builder.build(new Mapper.BuilderContext(createIndexSettings().getSettings(), new ContentPath(0))); super.setUp(); } - - public void testVersusKeywordField() throws IOException { + + public void testIllegalDocValuesArgument() { + Builder ft = new WildcardFieldMapper.Builder("test"); + MapperParsingException e = expectThrows(MapperParsingException.class, + () -> ft.docValues(false)); + assertEquals("The field [test] cannot have doc values = false", e.getMessage()); + } + + public void testIllegalIndexedArgument() { + Builder ft = new WildcardFieldMapper.Builder("test"); + MapperParsingException e = expectThrows(MapperParsingException.class, + () -> ft.index(false)); + assertEquals("The field [test] cannot have index = false", e.getMessage()); + } + + public void testSearchResultsVersusKeywordField() throws IOException { Directory dir = newDirectory(); IndexWriterConfig iwc = newIndexWriterConfig(); iwc.setMergePolicy(newTieredMergePolicy(random())); From 42e55bc90e86c7ddf7c13eeaf1a83b821f6bdfc5 Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 24 Feb 2020 16:32:33 +0000 Subject: [PATCH 03/32] Docs change --- docs/reference/mapping/types.asciidoc | 2 +- .../{wildcard.asciidoc => wildcard-keyword.asciidoc} | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) rename docs/reference/mapping/types/{wildcard.asciidoc => wildcard-keyword.asciidoc} (68%) diff --git a/docs/reference/mapping/types.asciidoc b/docs/reference/mapping/types.asciidoc index d26d897474deb..509dda6fe690a 100644 --- a/docs/reference/mapping/types.asciidoc +++ b/docs/reference/mapping/types.asciidoc @@ -132,4 +132,4 @@ include::types/shape.asciidoc[] include::types/constant-keyword.asciidoc[] -include::types/wildcard.asciidoc[] +include::types/wildcard-keyword.asciidoc[] diff --git a/docs/reference/mapping/types/wildcard.asciidoc b/docs/reference/mapping/types/wildcard-keyword.asciidoc similarity index 68% rename from docs/reference/mapping/types/wildcard.asciidoc rename to docs/reference/mapping/types/wildcard-keyword.asciidoc index 4d64febb9633a..7479417f489f3 100644 --- a/docs/reference/mapping/types/wildcard.asciidoc +++ b/docs/reference/mapping/types/wildcard-keyword.asciidoc @@ -1,17 +1,17 @@ [role="xpack"] [testenv="basic"] -[[wildcard]] +[[wildcard-keyword]] === Wildcard datatype ++++ Wildcard ++++ -A `wildcard` field stores values optimised for wildcard queries. +A `wildcard_keyword` field stores values optimised for wildcard queries. Wildcard queries are possible on other field types but suffer from constraints: * `text` fields limit matching of any wildcard expressions to individual tokens rather than the original whole value held in a field * `keyword` fields are untokenized but slow at performing wildcard queries (especially patterns with leading wildcards). -Internally the `wildcard` field indexes the whole field value using ngrams and stores the full string in compressed chunks. +Internally the `wildcard_keyword` field indexes the whole field value using ngrams and stores the full string in compressed chunks. The index is used as a rough filter to cut down the number of values that are then checked by retrieving and checking the full values from the compressed store. Storage costs are typically lower than those of `keyword` fields @@ -24,7 +24,7 @@ PUT my_index "mappings": { "properties": { "my_wildcard": { - "type": "wildcard" + "type": "wildcard_keyword" } } } @@ -48,6 +48,6 @@ GET my_index/_doc/_search ==== Limitations -* `wildcard` fields are untokenized like keyword fields, so do not support queries that rely on word positions such as phrase queries. -* `wildcard` fields cannot be used as a value source in aggregations such as the `terms` aggregation. +* `wildcard_keyword` fields are untokenized like keyword fields, so do not support queries that rely on word positions such as phrase queries. +* `wildcard_keyword` fields cannot be used as a value source in aggregations such as the `terms` aggregation. From c7714a0a1b350b9d9348c609f085aed83872f2c2 Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 24 Feb 2020 16:34:49 +0000 Subject: [PATCH 04/32] Docs change --- docs/reference/mapping/types/wildcard-keyword.asciidoc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/reference/mapping/types/wildcard-keyword.asciidoc b/docs/reference/mapping/types/wildcard-keyword.asciidoc index 7479417f489f3..542a903a9e5fe 100644 --- a/docs/reference/mapping/types/wildcard-keyword.asciidoc +++ b/docs/reference/mapping/types/wildcard-keyword.asciidoc @@ -1,9 +1,9 @@ [role="xpack"] [testenv="basic"] [[wildcard-keyword]] -=== Wildcard datatype +=== Wildcard keyword datatype ++++ -Wildcard +Wildcard keyword ++++ A `wildcard_keyword` field stores values optimised for wildcard queries. From 67c8a573cdb64841442e36335ca23dee801d6434 Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 24 Feb 2020 16:46:21 +0000 Subject: [PATCH 05/32] Docs change --- docs/reference/mapping/types.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/mapping/types.asciidoc b/docs/reference/mapping/types.asciidoc index 509dda6fe690a..1082a121c5e3a 100644 --- a/docs/reference/mapping/types.asciidoc +++ b/docs/reference/mapping/types.asciidoc @@ -7,7 +7,7 @@ document: [float] === Core datatypes -string:: <>, <> and <> +string:: <>, <> and <> <>:: `long`, `integer`, `short`, `byte`, `double`, `float`, `half_float`, `scaled_float` <>:: `date` <>:: `date_nanos` From dac8408c058e9dba0bcfa960a7df835c388484a5 Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 24 Feb 2020 16:55:26 +0000 Subject: [PATCH 06/32] Doc change --- docs/reference/mapping/types.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/mapping/types.asciidoc b/docs/reference/mapping/types.asciidoc index 1082a121c5e3a..2d10e8d92cc3e 100644 --- a/docs/reference/mapping/types.asciidoc +++ b/docs/reference/mapping/types.asciidoc @@ -7,7 +7,7 @@ document: [float] === Core datatypes -string:: <>, <> and <> +string:: <>, <> and <> <>:: `long`, `integer`, `short`, `byte`, `double`, `float`, `half_float`, `scaled_float` <>:: `date` <>:: `date_nanos` From 8f7f0a49dbfed67ca4e1a3433cb61b7c028800ba Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 24 Feb 2020 17:09:02 +0000 Subject: [PATCH 07/32] Docs fix --- docs/reference/mapping/types/wildcard-keyword.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/mapping/types/wildcard-keyword.asciidoc b/docs/reference/mapping/types/wildcard-keyword.asciidoc index 542a903a9e5fe..d965466d4ca37 100644 --- a/docs/reference/mapping/types/wildcard-keyword.asciidoc +++ b/docs/reference/mapping/types/wildcard-keyword.asciidoc @@ -35,7 +35,7 @@ PUT my_index/_doc/1 "my_wildcard" : "This string can be quite lengthy" } -GET my_index/_doc/_search +POST my_index/_doc/_search { "query": { "wildcard" : "*quite*lengthy" From dddd1aefae846b6ac0799a33d3720962e55c19ee Mon Sep 17 00:00:00 2001 From: markharwood Date: Tue, 25 Feb 2020 10:02:20 +0000 Subject: [PATCH 08/32] Remove redundant inter test, add ignore_above test to unit test --- .../wildcard/mapper/WildcardFieldMapper.java | 8 +- .../xpack/wildcard/WildcardFieldTests.java | 233 ------------------ .../mapper/WildcardFieldMapperTests.java | 32 ++- 3 files changed, 32 insertions(+), 241 deletions(-) delete mode 100644 x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/WildcardFieldTests.java diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index 8fd16f40b83e3..b151819ba8f9d 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -539,15 +539,13 @@ protected void parseCreateField(ParseContext context, List field value = parser.textOrNull(); } } - - if (value == null || value.length() > ignoreAbove) { - return; - } - createFields(value, fields); } void createFields(String value, Listfields) { + if (value == null || value.length() > ignoreAbove) { + return; + } KeywordTokenizer kt = new KeywordTokenizer(256); kt.setReader(new StringReader(TOKEN_START_OR_END_CHAR+ value +TOKEN_START_OR_END_CHAR)); TokenFilter filter = new TaperedNgramTokenFilter(kt, fieldType().numChars); diff --git a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/WildcardFieldTests.java b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/WildcardFieldTests.java deleted file mode 100644 index 8a81fca7a6b7e..0000000000000 --- a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/WildcardFieldTests.java +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License; - * you may not use this file except in compliance with the Elastic License. - */ - -package org.elasticsearch.xpack.wildcard; - - -import org.apache.lucene.search.WildcardQuery; -import org.elasticsearch.action.index.IndexRequestBuilder; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.common.xcontent.XContentBuilder; -import org.elasticsearch.index.query.WildcardQueryBuilder; -import org.elasticsearch.plugins.Plugin; -import org.elasticsearch.test.ESIntegTestCase; -import org.elasticsearch.xpack.core.LocalStateCompositeXPackPlugin; -import org.elasticsearch.xpack.wildcard.mapper.WildcardFieldMapper; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; -import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; -import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchResponse; -import static org.hamcrest.Matchers.equalTo; - -@ESIntegTestCase.SuiteScopeTestCase -public class WildcardFieldTests extends ESIntegTestCase { - - static final int MAX_FIELD_LENGTH = 100; - static final String TOO_BIG_PREFIX = "toobig"; - static String randomABString(int minLength) { - StringBuilder sb = new StringBuilder(); - while (sb.length() < minLength) { - if (randomBoolean()) { - sb.append("a"); - } else { - sb.append("b"); - } - } - return sb.toString(); - } - - @Override - public void setupSuiteScopeCluster() throws Exception { - - assertAcked(prepareCreate("idx").setMapping( buildMapping()).get()); - - List builders = new ArrayList<>(); - int numDocs = 100; - for (int i = 0; i < numDocs; i++) { - // TODO An issue with array-based matching ... https://github.com/elastic/elasticsearch/pull/49993#issuecomment-567531755 -// if(i%10==0) { - if(false) { - // One in ten docs use an array, not a single value - String data1 = randomABString(1+randomInt(MAX_FIELD_LENGTH)); - String data2 = randomABString(1+randomInt(MAX_FIELD_LENGTH)); - XContentBuilder source = jsonBuilder().startObject().array("data", data1, data2) - .endObject(); - builders.add(client().prepareIndex("idx").setId("" + i).setSource(source)); - } else { - String data = randomABString(1+randomInt(MAX_FIELD_LENGTH)); - XContentBuilder source = jsonBuilder().startObject().field("data", data).endObject(); - builders.add(client().prepareIndex("idx").setId("" + i).setSource(source)); - } - } - // Add a doc to test ignore_above - String data = TOO_BIG_PREFIX + randomABString(MAX_FIELD_LENGTH); - XContentBuilder source = jsonBuilder().startObject().field("data", data).endObject(); - builders.add(client().prepareIndex("idx").setSource(source)); - - indexRandom(true, builders); - ensureSearchable(); - } - - public static Map buildMapping() { - Map fields = new HashMap<>(); - - Map rootFieldDef = new HashMap<>(); - fields.put("data", rootFieldDef); - { - rootFieldDef.put("type", WildcardFieldMapper.CONTENT_TYPE); - rootFieldDef.put("ignore_above", MAX_FIELD_LENGTH); - Map subFields = new HashMap<>(); - rootFieldDef.put("fields", subFields); - { - Map subFieldDef1 = new HashMap<>(); - subFields.put("asKeyword", subFieldDef1); - { - subFieldDef1.put("type", "keyword"); - subFieldDef1.put("ignore_above", MAX_FIELD_LENGTH); - } - Map subFieldDef2 = new HashMap<>(); - subFields.put("asPosWildcard", subFieldDef2); - { - subFieldDef2.put("type", WildcardFieldMapper.CONTENT_TYPE); - subFieldDef2.put("ignore_above", MAX_FIELD_LENGTH); - subFieldDef2.put("match_type", WildcardFieldMapper.Defaults.MATCH_TYPE_POSITION); - } - Map subFieldDef3 = new HashMap<>(); - subFields.put("asBinaryDV", subFieldDef3); - { - subFieldDef3.put("type", WildcardFieldMapper.CONTENT_TYPE); - subFieldDef3.put("ignore_above", MAX_FIELD_LENGTH); - subFieldDef3.put("match_type", WildcardFieldMapper.Defaults.MATCH_TYPE_BINARY_DOC_VALUES); - } - } - } - return Collections.singletonMap("properties", fields); - } - - @Override - protected Collection> nodePlugins() { - List> plugins = new ArrayList<>(); - plugins.add(Wildcard.class); - plugins.add(LocalStateCompositeXPackPlugin.class); - return plugins; - } - - - public void testKeywordAndWildcardSearchesConcur() throws Exception { - - int numSearches = 100; - for (int i = 0; i < numSearches; i++) { - String randomWildcardPattern = getRandomWildcardPattern(); - SearchResponse wildcardResponse = client().prepareSearch("idx") - .setQuery(new WildcardQueryBuilder("data", randomWildcardPattern)) - .get(); - - assertSearchResponse(wildcardResponse); - - SearchResponse keywordResponse = client().prepareSearch("idx") - .setQuery(new WildcardQueryBuilder("data.asKeyword", randomWildcardPattern)) - .get(); - assertSearchResponse(keywordResponse); - assertThat(wildcardResponse.getHits().getTotalHits().value, equalTo(keywordResponse.getHits().getTotalHits().value)); - - } - } - - public void testKeywordAndBInaryDvWildcardSearchesConcur() throws Exception { - - int numSearches = 100; - for (int i = 0; i < numSearches; i++) { - String randomWildcardPattern = getRandomWildcardPattern(); - SearchResponse wildcardResponse = client().prepareSearch("idx") - .setQuery(new WildcardQueryBuilder("data.asBinaryDV", randomWildcardPattern)) - .get(); - - assertSearchResponse(wildcardResponse); - - SearchResponse keywordResponse = client().prepareSearch("idx") - .setQuery(new WildcardQueryBuilder("data.asKeyword", randomWildcardPattern)) - .get(); - assertSearchResponse(keywordResponse); - assertThat(wildcardResponse.getHits().getTotalHits().value, equalTo(keywordResponse.getHits().getTotalHits().value)); - - } - } - - - public void testWildcardPosAndKeywordSearchesConcur() throws Exception { - - int numSearches = 100; - for (int i = 0; i < numSearches; i++) { - String randomWildcardPattern = getRandomWildcardPattern(); - SearchResponse wildcardResponse = client().prepareSearch("idx") - .setQuery(new WildcardQueryBuilder("data.asPosWildcard", randomWildcardPattern)) - .get(); - - assertSearchResponse(wildcardResponse); - - SearchResponse keywordResponse = client().prepareSearch("idx") - .setQuery(new WildcardQueryBuilder("data.asKeyword", randomWildcardPattern)) - .get(); - assertSearchResponse(keywordResponse); - assertThat(wildcardResponse.getHits().getTotalHits().value, equalTo(keywordResponse.getHits().getTotalHits().value)); - - } - } - - public void testIgnoreAbove() throws Exception { - SearchResponse wildcardResponse = client().prepareSearch("idx").setQuery(new WildcardQueryBuilder("data", TOO_BIG_PREFIX + "*")) - .get(); - assertSearchResponse(wildcardResponse); - assertThat(wildcardResponse.getHits().getTotalHits().value, equalTo(0L)); - } - - private void randomSyntaxChar(StringBuilder sb) { - switch (randomInt(3)) { - case 0: - sb.append(WildcardQuery.WILDCARD_CHAR); - break; - case 1: - sb.append(WildcardQuery.WILDCARD_STRING); - break; - case 2: - sb.append(WildcardQuery.WILDCARD_ESCAPE); - sb.append(WildcardQuery.WILDCARD_STRING); - break; - case 3: - sb.append(WildcardQuery.WILDCARD_ESCAPE); - sb.append(WildcardQuery.WILDCARD_CHAR); - break; - } - } - - private String getRandomWildcardPattern() { - StringBuilder sb = new StringBuilder(); - - int numFragments = 1+randomInt(4); - - if(randomInt(10)==1) { - randomSyntaxChar(sb); - } - for (int i = 0; i < numFragments; i++) { - if(i>0) { - randomSyntaxChar(sb); - } - sb.append(randomABString(1+randomInt(6))); - } - if(randomInt(10)==1) { - randomSyntaxChar(sb); - } - return sb.toString(); - } - -} diff --git a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java index 7f9b9ac3e4c22..bdc9b98c8f8eb 100644 --- a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java +++ b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java @@ -42,12 +42,14 @@ public class WildcardFieldMapperTests extends ESTestCase { private static final String KEYWORD_FIELD_NAME = "keyword_field"; private static final String WILDCARD_FIELD_NAME = "wildcard_field"; + static final int MAX_FIELD_LENGTH = 100; static WildcardFieldMapper wildcardFieldType; @Override @Before public void setUp() throws Exception { Builder builder = new WildcardFieldMapper.Builder(WILDCARD_FIELD_NAME); + builder.ignoreAbove(MAX_FIELD_LENGTH); wildcardFieldType = builder.build(new Mapper.BuilderContext(createIndexSettings().getSettings(), new ContentPath(0))); super.setUp(); } @@ -64,7 +66,32 @@ public void testIllegalIndexedArgument() { MapperParsingException e = expectThrows(MapperParsingException.class, () -> ft.index(false)); assertEquals("The field [test] cannot have index = false", e.getMessage()); - } + } + + public void testTooBigKeywordField() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(); + iwc.setMergePolicy(newTieredMergePolicy(random())); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + + // Create a string that is too large and will not be indexed + String docContent = randomABString(MAX_FIELD_LENGTH + 1); + createDocs(docContent, iw); + iw.forceMerge(1); + DirectoryReader reader = iw.getReader(); + IndexSearcher searcher = newSearcher(reader); + iw.close(); + + + Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery("*a*", null, null); + TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.INDEXORDER); + assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(0L)); + + reader.close(); + dir.close(); + } + public void testSearchResultsVersusKeywordField() throws IOException { Directory dir = newDirectory(); @@ -75,7 +102,7 @@ public void testSearchResultsVersusKeywordField() throws IOException { int numDocs = 100; HashSet values = new HashSet<>(); for (int i = 0; i < numDocs; i++) { - String docContent = randomABString(1 + randomInt(MAX_FIELD_LENGTH)); + String docContent = randomABString(1 + randomInt(MAX_FIELD_LENGTH - 1)); if (values.contains(docContent) == false) { createDocs(docContent, iw); values.add(docContent); @@ -130,7 +157,6 @@ protected IndexSettings createIndexSettings() { Settings.EMPTY); } - static final int MAX_FIELD_LENGTH = 100; static String randomABString(int minLength) { StringBuilder sb = new StringBuilder(); From 3cb80a49d19570109d81466c7fb8f7283ccc00a5 Mon Sep 17 00:00:00 2001 From: markharwood Date: Tue, 25 Feb 2020 12:08:10 +0000 Subject: [PATCH 09/32] Added support for aggs --- .../fielddata/plain/DocValuesIndexFieldData.java | 11 +++++++++++ .../xpack/wildcard/mapper/WildcardFieldMapper.java | 12 +++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/index/fielddata/plain/DocValuesIndexFieldData.java b/server/src/main/java/org/elasticsearch/index/fielddata/plain/DocValuesIndexFieldData.java index 529bdb84b12ac..039338299d82b 100644 --- a/server/src/main/java/org/elasticsearch/index/fielddata/plain/DocValuesIndexFieldData.java +++ b/server/src/main/java/org/elasticsearch/index/fielddata/plain/DocValuesIndexFieldData.java @@ -66,6 +66,17 @@ public final void clear(IndexReader reader) { public final Index index() { return index; } + + public static class BinaryBuilder implements IndexFieldData.Builder { + + @Override + public IndexFieldData build(IndexSettings indexSettings, MappedFieldType fieldType, IndexFieldDataCache cache, + CircuitBreakerService breakerService, MapperService mapperService) { + // Ignore Circuit Breaker + return new BinaryDVIndexFieldData(indexSettings.getIndex(), fieldType.name()); + } + + } public static class Builder implements IndexFieldData.Builder { private static final Set BINARY_INDEX_FIELD_NAMES = unmodifiableSet(newHashSet(IdFieldMapper.NAME)); diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index b151819ba8f9d..4b51f8885bb56 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -36,6 +36,8 @@ import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.common.xcontent.support.XContentMapValues; +import org.elasticsearch.index.fielddata.IndexFieldData; +import org.elasticsearch.index.fielddata.plain.DocValuesIndexFieldData; import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.Mapper; @@ -140,6 +142,7 @@ protected void setupFieldType(BuilderContext context) { super.setupFieldType(context); fieldType().setNumChars(numChars); fieldType().setMatchType(matchType); + fieldType().setHasDocValues(true); if (matchType.equals(Defaults.MATCH_TYPE_POSITION)) { fieldType().setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); } else { @@ -474,7 +477,14 @@ public Query termsQuery(List values, QueryShardContext context) { bq.add(termQuery(value, context), Occur.SHOULD); } return new ConstantScoreQuery(bq.build()); - } + } + + @Override + public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) { + failIfNoDocValues(); + return new DocValuesIndexFieldData.BinaryBuilder(); + } + } private int ignoreAbove; From a4af4c761f24db4d53c01674fe36d955aeaab937 Mon Sep 17 00:00:00 2001 From: markharwood Date: Tue, 25 Feb 2020 13:13:25 +0000 Subject: [PATCH 10/32] =?UTF-8?q?Remove=20redundant=20code=20now=20that=20?= =?UTF-8?q?we=E2=80=99ve=20settled=20on=203grams=20with=20binary=20doc=20v?= =?UTF-8?q?alues?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../wildcard/mapper/WildcardFieldMapper.java | 92 +------- .../wildcard/mapper/WildcardOnDvQuery.java | 99 --------- .../mapper/WildcardPositionBasedQuery.java | 203 ------------------ 3 files changed, 9 insertions(+), 385 deletions(-) delete mode 100644 x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardOnDvQuery.java delete mode 100644 x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardPositionBasedQuery.java diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index 4b51f8885bb56..98dcd63b161c8 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -13,7 +13,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Field; -import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; @@ -72,18 +71,12 @@ public static class Defaults { FIELD_TYPE.setOmitNorms(true); FIELD_TYPE.freeze(); } - public static final int IGNORE_ABOVE = Integer.MAX_VALUE; - public static final String MATCH_TYPE_DOC_VALUES = "doc_values"; - public static final String MATCH_TYPE_BINARY_DOC_VALUES = "binary_doc_values"; - public static final String MATCH_TYPE_POSITION = "positions"; - + public static final int IGNORE_ABOVE = Integer.MAX_VALUE; } public static class Builder extends FieldMapper.Builder { private int numChars = 3; protected int ignoreAbove = Defaults.IGNORE_ABOVE; - protected String matchType = Defaults.MATCH_TYPE_BINARY_DOC_VALUES; - public Builder(String name) { super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE); @@ -124,30 +117,14 @@ public Builder ignoreAbove(int ignoreAbove) { this.ignoreAbove = ignoreAbove; return this; } - - public Builder matchType(String matchType) { - if (Defaults.MATCH_TYPE_DOC_VALUES.equals(matchType) == false && - Defaults.MATCH_TYPE_BINARY_DOC_VALUES.equals(matchType) == false && - Defaults.MATCH_TYPE_POSITION.equals(matchType) == false) { - throw new IllegalArgumentException("[match_type] must be " + Defaults.MATCH_TYPE_DOC_VALUES + " or " - + Defaults.MATCH_TYPE_BINARY_DOC_VALUES + " or " - + Defaults.MATCH_TYPE_POSITION + ", got " + matchType); - } - this.matchType = matchType; - return this; - } + @Override protected void setupFieldType(BuilderContext context) { super.setupFieldType(context); fieldType().setNumChars(numChars); - fieldType().setMatchType(matchType); fieldType().setHasDocValues(true); - if (matchType.equals(Defaults.MATCH_TYPE_POSITION)) { - fieldType().setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); - } else { - fieldType().setIndexOptions(IndexOptions.DOCS); - } + fieldType().setIndexOptions(IndexOptions.DOCS); } @Override @@ -159,24 +136,9 @@ public WildcardFieldType fieldType() { public WildcardFieldMapper build(BuilderContext context) { setupFieldType(context); return new WildcardFieldMapper( - name, fieldType, defaultFieldType, ignoreAbove, matchType, numChars, + name, fieldType, defaultFieldType, ignoreAbove, numChars, context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo); } - - @Override - public Builder indexOptions(IndexOptions indexOptions) { - // Suspected parse sequencing problem here - if match_type not set yet we don't know - // if this is appropriate or not.... -// if (matchType.equals(Defaults.MATCH_TYPE_DOC_VALUES)) { -// if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) > 0) { -// throw new IllegalArgumentException("The [wildcard] field does not support positions with match_type " -// + Defaults.MATCH_TYPE_DOC_VALUES+", got [index_options]=" -// + indexOptionToString(indexOptions)); -// } -// } - return super.indexOptions(indexOptions); - } - } public static class TypeParser implements Mapper.TypeParser { @@ -199,9 +161,6 @@ public static class TypeParser implements Mapper.TypeParser { } else if (propName.equals("ignore_above")) { builder.ignoreAbove(XContentMapValues.nodeIntegerValue(propNode, -1)); iterator.remove(); - } else if (propName.equals("match_type")) { - builder.matchType(XContentMapValues.nodeStringValue(propNode, Defaults.MATCH_TYPE_BINARY_DOC_VALUES)); - iterator.remove(); } } @@ -216,7 +175,6 @@ public static class TypeParser implements Mapper.TypeParser { public static final class WildcardFieldType extends MappedFieldType { private int numChars; - private String matchType; public WildcardFieldType() { setIndexAnalyzer(Lucene.KEYWORD_ANALYZER); @@ -348,11 +306,6 @@ public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QuerySh PatternStructure patternStructure = new PatternStructure(wildcardPattern); - if (matchType.equals(Defaults.MATCH_TYPE_POSITION)) { - return new WildcardPositionBasedQuery(name(), patternStructure, numChars); - } - - BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder(); for (int i = 0; i < patternStructure.fragments.length; i++) { @@ -432,12 +385,7 @@ public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QuerySh if (approximation.clauses().size() > 1 || patternStructure.needsVerification()) { BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder(); verifyingBuilder.add(new BooleanClause(approximation, Occur.MUST)); - if (matchType.equals(Defaults.MATCH_TYPE_DOC_VALUES)) { - verifyingBuilder.add(new BooleanClause(new WildcardOnDvQuery(name(), wildcardPattern), Occur.MUST)); - } else { - assert matchType.equals(Defaults.MATCH_TYPE_BINARY_DOC_VALUES); - verifyingBuilder.add(new BooleanClause(new WildcardOnBinaryDvQuery(name(), wildcardPattern), Occur.MUST)); - } + verifyingBuilder.add(new BooleanClause(new WildcardOnBinaryDvQuery(name(), wildcardPattern), Occur.MUST)); return verifyingBuilder.build(); } return approximation; @@ -451,10 +399,6 @@ void setNumChars(int numChars) { this.numChars = numChars; } - void setMatchType(String matchType) { - this.matchType = matchType; - } - @Override public String typeName() { return CONTENT_TYPE; @@ -489,21 +433,13 @@ public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) { private int ignoreAbove; private int numChars; - private String matchType; private WildcardFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType, - int ignoreAbove, String matchType, int numChars, Settings indexSettings, MultiFields multiFields, CopyTo copyTo) { + int ignoreAbove, int numChars, Settings indexSettings, MultiFields multiFields, CopyTo copyTo) { super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo); this.ignoreAbove = ignoreAbove; - this.matchType = matchType; this.numChars = numChars; - if (matchType.equals(Defaults.MATCH_TYPE_POSITION)) { - assert fieldType.indexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; - - } else { - assert fieldType.indexOptions() == IndexOptions.DOCS; - - } + assert fieldType.indexOptions() == IndexOptions.DOCS; } @@ -531,9 +467,6 @@ protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, if (includeDefaults || ignoreAbove != Defaults.IGNORE_ABOVE) { builder.field("ignore_above", ignoreAbove); } - if (includeDefaults || matchType != Defaults.MATCH_TYPE_BINARY_DOC_VALUES) { - builder.field("match_type", matchType); - } } @Override @@ -563,14 +496,8 @@ void createFields(String value, Listfields) { Field field = new Field(fieldType().name(), filter, fieldType()); fields.add(field); - if (matchType.equals(Defaults.MATCH_TYPE_DOC_VALUES)) { - Field dvField = new SortedSetDocValuesField(fieldType().name(), new BytesRef(value)); - fields.add(dvField); - } - if (matchType.equals(Defaults.MATCH_TYPE_BINARY_DOC_VALUES)) { - Field dvField = new BinaryDocValuesField(fieldType().name(), new BytesRef(value)); - fields.add(dvField); - } + Field dvField = new BinaryDocValuesField(fieldType().name(), new BytesRef(value)); + fields.add(dvField); } @Override @@ -583,7 +510,6 @@ protected String contentType() { protected void doMerge(Mapper mergeWith) { super.doMerge(mergeWith); this.ignoreAbove = ((WildcardFieldMapper) mergeWith).ignoreAbove; - this.matchType = ((WildcardFieldMapper) mergeWith).matchType; this.numChars = ((WildcardFieldMapper) mergeWith).numChars; } } diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardOnDvQuery.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardOnDvQuery.java deleted file mode 100644 index 596f2e962b7be..0000000000000 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardOnDvQuery.java +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License; - * you may not use this file except in compliance with the Elastic License. - */ - -package org.elasticsearch.xpack.wildcard.mapper; - -import org.apache.lucene.index.DocValues; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.SortedSetDocValues; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.ConstantScoreScorer; -import org.apache.lucene.search.ConstantScoreWeight; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.ScoreMode; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.TwoPhaseIterator; -import org.apache.lucene.search.Weight; -import org.apache.lucene.search.WildcardQuery; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.ByteRunAutomaton; - -import java.io.IOException; -import java.util.Objects; - -/** - * Query that runs a wildcard pattern across all doc values. - * Expensive to run so normally used in conjunction with more selective query clauses. - */ -public class WildcardOnDvQuery extends Query { - - private final String field; - private final String wildcardPattern; - - public WildcardOnDvQuery(String field, String wildcardPattern) { - this.field = field; - this.wildcardPattern = wildcardPattern; - } - - @Override - public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { - - Automaton automaton = WildcardQuery.toAutomaton(new Term(field,wildcardPattern)); - ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton); - - return new ConstantScoreWeight(this, boost) { - - @Override - public Scorer scorer(LeafReaderContext context) throws IOException { - SortedSetDocValues values = DocValues.getSortedSet(context.reader(), field); - TwoPhaseIterator twoPhase = new TwoPhaseIterator(values) { - @Override - public boolean matches() throws IOException { - long ord = values.nextOrd(); - while (ord != SortedSetDocValues.NO_MORE_ORDS) { - BytesRef value = values.lookupOrd(ord); - if (bytesMatcher.run(value.bytes, value.offset, value.length)) { - return true; - } - ord = values.nextOrd(); - } - return false; - } - - @Override - public float matchCost() { - // TODO: how can we compute this? - return 1000f; - } - }; - return new ConstantScoreScorer(this, score(), scoreMode, twoPhase); - } - - @Override - public boolean isCacheable(LeafReaderContext ctx) { - return true; - } - }; - } - @Override - public String toString(String field) { - return field+":"+wildcardPattern; - } - - @Override - public boolean equals(Object obj) { - WildcardOnDvQuery other = (WildcardOnDvQuery) obj; - return Objects.equals(field, other.field) && Objects.equals(wildcardPattern, other.wildcardPattern); - } - - @Override - public int hashCode() { - return Objects.hash(field, wildcardPattern); - } - -} diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardPositionBasedQuery.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardPositionBasedQuery.java deleted file mode 100644 index 8dd2032726550..0000000000000 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardPositionBasedQuery.java +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License; - * you may not use this file except in compliance with the Elastic License. - */ - -package org.elasticsearch.xpack.wildcard.mapper; - -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.core.KeywordTokenizer; -import org.apache.lucene.analysis.ngram.NGramTokenFilter; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.queries.XIntervals; -import org.apache.lucene.queries.intervals.IntervalQuery; -import org.apache.lucene.queries.intervals.Intervals; -import org.apache.lucene.queries.intervals.IntervalsSource; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.WildcardQuery; -import org.apache.lucene.util.BytesRef; -import org.elasticsearch.ElasticsearchParseException; -import org.elasticsearch.xpack.wildcard.mapper.WildcardFieldMapper.WildcardFieldType.PatternStructure; - -import java.io.IOException; -import java.io.StringReader; -import java.util.ArrayList; -import java.util.Objects; - -public class WildcardPositionBasedQuery extends Query{ - - private final String field; - private final PatternStructure wildcardPattern; - private final int numChars; - - public WildcardPositionBasedQuery(String field, PatternStructure wildcardPattern, int numChars) { - this.field = field; - this.wildcardPattern = wildcardPattern; - this.numChars = numChars; - } - - - private void addFragment(ArrayList spans, int fragmentNum, IntervalsSource newFragment) { - // TODO mixtures of ? and * in a sequence e.g. aa?* are problematic because we only have the max gap - //(Integer.maxInt in this case) and not the min (which is 1 in this example). - int precedingGapSize = wildcardPattern.getPrecedingGapSize(fragmentNum); - if ( precedingGapSize >0 && precedingGapSize < Integer.MAX_VALUE) { - if (spans.size() == 0) { - IntervalsSource wildcard = XIntervals.wildcard(new BytesRef(WildcardFieldMapper.TOKEN_START_OR_END_CHAR + "*")); - IntervalsSource addedGap = Intervals.extend(wildcard, 0, precedingGapSize); - IntervalsSource phrase = Intervals.phrase(addedGap, newFragment); - spans.add(phrase); - return; - } - IntervalsSource lastFragment = spans.get(spans.size()-1); - - IntervalsSource addedGap = Intervals.extend(lastFragment, 0, precedingGapSize); - IntervalsSource phrase = Intervals.phrase(addedGap, newFragment); - spans.set(spans.size()-1, phrase); - - - } else { - spans.add(newFragment); - } - } - - - - - private String escapeWildcards(String s) { - s = s.replace(Character.toString(WildcardQuery.WILDCARD_CHAR), - Character.toString(WildcardQuery.WILDCARD_ESCAPE)+Character.toString(WildcardQuery.WILDCARD_CHAR)); - s = s.replace(Character.toString(WildcardQuery.WILDCARD_STRING), - Character.toString(WildcardQuery.WILDCARD_ESCAPE)+Character.toString(WildcardQuery.WILDCARD_STRING)); - return s; - } - - @Override - public Query rewrite(IndexReader reader) throws IOException { - - - ArrayList spans = new ArrayList<>(); - - - for (int i = 0; i < wildcardPattern.fragments.length; i++) { - String fragment = wildcardPattern.fragments[i]; - int fLength = fragment.length(); - if (fLength == 0) { - continue; - } - - // Add any start/end of string character - if (i == 0 && wildcardPattern.openStart == false) { - // Start-of-string anchored (is not a leading wildcard) - fragment = WildcardFieldMapper.TOKEN_START_OR_END_CHAR + fragment; - } - if (wildcardPattern.openEnd == false && i == wildcardPattern.fragments.length - 1) { - // End-of-string anchored (is not a trailing wildcard) - fragment = fragment + WildcardFieldMapper.TOKEN_START_OR_END_CHAR; - } - - if (fragment.length() == numChars) { - IntervalsSource addedGap = Intervals.extend(Intervals.term(fragment), 0, fragment.length()-1); - addFragment(spans, i, addedGap); - } else if (fragment.length() > numChars) { - // Break fragment into multiple Ngrams - - - ArrayList fragmentRun = new ArrayList<>(); - KeywordTokenizer kt = new KeywordTokenizer(256); - kt.setReader(new StringReader(fragment)); - TokenFilter filter = new NGramTokenFilter(kt, numChars, numChars, false); - CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class); - try { - filter.reset(); - int nextRequiredCoverage = 0; - int charPos = 0; - int endOfOptimisableSection = fragment.length() - (numChars * 2); - - while (filter.incrementToken()) { - if (charPos < endOfOptimisableSection) { - if (charPos == nextRequiredCoverage) { - IntervalsSource iTerm = Intervals.term(termAtt.toString()); - fragmentRun.add(Intervals.extend(iTerm, 0, termAtt.length() - 1)); - // optimise - skip unnecessary overlapping tokens - nextRequiredCoverage = charPos + termAtt.length(); - } - } else { - // We are into the tail of the string that can't be optimised by skipping - if (charPos >= nextRequiredCoverage) { - fragmentRun.add(Intervals.term(termAtt.toString())); - if (charPos + termAtt.length() >= fragment.length()) { - // we've achieved full coverage of the pattern now - break; - } - } - } - charPos++; - } - kt.end(); - kt.close(); - } catch(IOException ioe) { - throw new ElasticsearchParseException("Error parsing wildcard query pattern fragment ["+fragment+"]"); - } - - IntervalsSource phrase = Intervals.phrase(fragmentRun.toArray(new IntervalsSource[0])); - IntervalsSource addedGap = Intervals.extend(phrase, 0, numChars - 1); - addFragment(spans, i, addedGap); - - } else { - // fragment is smaller than smallest ngram size - if (wildcardPattern.openEnd || i < wildcardPattern.fragments.length - 1) { - // fragment occurs mid-string so will need a wildcard query - IntervalsSource wildcard = XIntervals.wildcard(new BytesRef(escapeWildcards(fragment) + "*")); - - IntervalsSource addedGap = Intervals.extend(wildcard, 0, fragment.length()-1); - addFragment(spans, i, addedGap); - } else { - // fragment occurs at end of string so can rely on Jim's indexing rule to optimise - // *foo by indexing smaller ngrams at the end of a string - IntervalsSource addedGap = Intervals.extend(Intervals.term(fragment), 0, fragment.length()-1); - addFragment(spans, i, addedGap); - } - } - } - - if (wildcardPattern.lastGap > 0 && wildcardPattern.lastGap < Integer.MAX_VALUE) { - IntervalsSource lastFragment = spans.get(spans.size() - 1); - IntervalsSource addedGap = Intervals.extend(lastFragment, 0, wildcardPattern.lastGap); - IntervalsSource fieldEnd = Intervals.term(new BytesRef("" + WildcardFieldMapper.TOKEN_START_OR_END_CHAR)); - IntervalsSource phrase = Intervals.phrase(addedGap, fieldEnd); - spans.set(spans.size() - 1, phrase); - } - - if (spans.size()==1) { - IntervalQuery iq = new IntervalQuery(field, spans.get(0)); - return iq; - } - - IntervalQuery iq = new IntervalQuery(field, Intervals.ordered(spans.toArray(new IntervalsSource[0]))); - return iq; - } - - @Override - public boolean equals(Object obj) { - WildcardPositionBasedQuery other = (WildcardPositionBasedQuery) obj; - return Objects.equals(field, other.field) && Objects.equals(wildcardPattern, other.wildcardPattern) && - Objects.equals(numChars, other.numChars); - } - - @Override - public int hashCode() { - return Objects.hash(field, wildcardPattern, numChars); - } - - - - - @Override - public String toString(String field) { - return field + ":" + wildcardPattern.pattern; - } - -} From f9893cdc1ede007b224640418fe0f4102b4baa1c Mon Sep 17 00:00:00 2001 From: markharwood Date: Wed, 26 Feb 2020 11:38:43 +0000 Subject: [PATCH 11/32] Bugfix - BinaryDVIndexFieldData.sortField had the wrong implementation. Added sort test to compare keyword field results with wildcard field - behaviour should be equivalent. --- .../plain/BinaryDVIndexFieldData.java | 18 +---- .../mapper/WildcardFieldMapperTests.java | 71 +++++++++++++++++++ 2 files changed, 73 insertions(+), 16 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/fielddata/plain/BinaryDVIndexFieldData.java b/server/src/main/java/org/elasticsearch/index/fielddata/plain/BinaryDVIndexFieldData.java index 27088382f2025..d9bd12f19522c 100644 --- a/server/src/main/java/org/elasticsearch/index/fielddata/plain/BinaryDVIndexFieldData.java +++ b/server/src/main/java/org/elasticsearch/index/fielddata/plain/BinaryDVIndexFieldData.java @@ -21,8 +21,6 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.SortField; -import org.apache.lucene.search.SortedSetSortField; -import org.apache.lucene.search.SortedSetSelector; import org.elasticsearch.common.Nullable; import org.elasticsearch.common.util.BigArrays; import org.elasticsearch.index.Index; @@ -53,21 +51,9 @@ public BinaryDVAtomicFieldData loadDirect(LeafReaderContext context) throws Exce @Override public SortField sortField(@Nullable Object missingValue, MultiValueMode sortMode, XFieldComparatorSource.Nested nested, boolean reverse) { - XFieldComparatorSource source = new BytesRefFieldComparatorSource(this, missingValue, sortMode, nested); - /** - * Check if we can use a simple {@link SortedSetSortField} compatible with index sorting and - * returns a custom sort field otherwise. - */ - if (nested != null || - (sortMode != MultiValueMode.MAX && sortMode != MultiValueMode.MIN) || - (source.sortMissingFirst(missingValue) == false && source.sortMissingLast(missingValue) == false)) { + XFieldComparatorSource source = new BytesRefFieldComparatorSource(this, missingValue, + sortMode, nested); return new SortField(getFieldName(), source, reverse); - } - SortField sortField = new SortedSetSortField(fieldName, reverse, - sortMode == MultiValueMode.MAX ? SortedSetSelector.Type.MAX : SortedSetSelector.Type.MIN); - sortField.setMissingValue(source.sortMissingLast(missingValue) ^ reverse ? - SortedSetSortField.STRING_LAST : SortedSetSortField.STRING_FIRST); - return sortField; } @Override diff --git a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java index bdc9b98c8f8eb..1e5509aa04ba7 100644 --- a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java +++ b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java @@ -8,6 +8,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.document.StringField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriterConfig; @@ -15,26 +16,41 @@ import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; import org.elasticsearch.Version; import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.util.BigArrays; +import org.elasticsearch.index.Index; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.cache.bitset.BitsetFilterCache; +import org.elasticsearch.index.fielddata.IndexFieldData; +import org.elasticsearch.index.fielddata.IndexFieldDataCache; import org.elasticsearch.index.mapper.ContentPath; +import org.elasticsearch.index.mapper.KeywordFieldMapper; +import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.Mapper; import org.elasticsearch.index.mapper.MapperParsingException; +import org.elasticsearch.index.query.QueryShardContext; +import org.elasticsearch.search.sort.FieldSortBuilder; import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.test.IndexSettingsModule; import org.elasticsearch.xpack.wildcard.mapper.WildcardFieldMapper.Builder; import org.junit.Before; +import org.mockito.Mockito; import java.io.IOException; import java.util.ArrayList; import java.util.HashSet; +import java.util.function.BiFunction; import static org.hamcrest.Matchers.equalTo; @@ -44,6 +60,7 @@ public class WildcardFieldMapperTests extends ESTestCase { private static final String WILDCARD_FIELD_NAME = "wildcard_field"; static final int MAX_FIELD_LENGTH = 100; static WildcardFieldMapper wildcardFieldType; + static KeywordFieldMapper keywordFieldType; @Override @Before @@ -51,6 +68,10 @@ public void setUp() throws Exception { Builder builder = new WildcardFieldMapper.Builder(WILDCARD_FIELD_NAME); builder.ignoreAbove(MAX_FIELD_LENGTH); wildcardFieldType = builder.build(new Mapper.BuilderContext(createIndexSettings().getSettings(), new ContentPath(0))); + + + org.elasticsearch.index.mapper.KeywordFieldMapper.Builder kwBuilder = new KeywordFieldMapper.Builder(KEYWORD_FIELD_NAME); + keywordFieldType = kwBuilder.build(new Mapper.BuilderContext(createIndexSettings().getSettings(), new ContentPath(0))); super.setUp(); } @@ -135,10 +156,58 @@ public void testSearchResultsVersusKeywordField() throws IOException { } assertThat(expectedDocs.size(), equalTo(0)); } + + + //Test keyword and wildcard sort operations are also equivalent + QueryShardContext shardContextMock = createMockShardContext(); + + FieldSortBuilder wildcardSortBuilder = new FieldSortBuilder(WILDCARD_FIELD_NAME); + SortField wildcardSortField = wildcardSortBuilder.build(shardContextMock).field; + ScoreDoc[] wildcardHits = searcher.search(new MatchAllDocsQuery(), numDocs, new Sort(wildcardSortField)).scoreDocs; + + FieldSortBuilder keywordSortBuilder = new FieldSortBuilder(KEYWORD_FIELD_NAME); + SortField keywordSortField = keywordSortBuilder.build(shardContextMock).field; + ScoreDoc[] keywordHits = searcher.search(new MatchAllDocsQuery(), numDocs, new Sort(keywordSortField)).scoreDocs; + + assertThat(wildcardHits.length, equalTo(keywordHits.length)); + for (int i = 0; i < wildcardHits.length; i++) { + assertThat(wildcardHits[i].doc, equalTo(keywordHits[i].doc)); + } + reader.close(); dir.close(); } + + + protected MappedFieldType provideMappedFieldType(String name) { + if (name.equals(WILDCARD_FIELD_NAME)) { + return wildcardFieldType.fieldType(); + } else { + return keywordFieldType.fieldType(); + } + } + + protected final QueryShardContext createMockShardContext() { + Index index = new Index(randomAlphaOfLengthBetween(1, 10), "_na_"); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings(index, + Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build()); + BitsetFilterCache bitsetFilterCache = new BitsetFilterCache(idxSettings, Mockito.mock(BitsetFilterCache.Listener.class)); + BiFunction> indexFieldDataLookup = (fieldType, fieldIndexName) -> { + IndexFieldData.Builder builder = fieldType.fielddataBuilder(fieldIndexName); + return builder.build(idxSettings, fieldType, new IndexFieldDataCache.None(), null, null); + }; + return new QueryShardContext(0, idxSettings, BigArrays.NON_RECYCLING_INSTANCE, bitsetFilterCache, indexFieldDataLookup, + null, null, null, xContentRegistry(), null, null, null, + () -> randomNonNegativeLong(), null, null, () -> true) { + + @Override + public MappedFieldType fieldMapper(String name) { + return provideMappedFieldType(name); + } + }; + } + private void createDocs(String docContent, RandomIndexWriter iw) throws IOException { ArrayList fields = new ArrayList<>(); wildcardFieldType.createFields(docContent, fields); @@ -146,6 +215,8 @@ private void createDocs(String docContent, RandomIndexWriter iw) throws IOExcept for (IndexableField indexableField : fields) { doc.add(indexableField); } + // Add keyword fields too + doc.add(new SortedSetDocValuesField(KEYWORD_FIELD_NAME, new BytesRef(docContent))); doc.add(new StringField(KEYWORD_FIELD_NAME, docContent, Field.Store.YES)); iw.addDocument(doc); } From ce47c6c5f5e4e3c5b3d0d684aa3b08606255b045 Mon Sep 17 00:00:00 2001 From: markharwood Date: Wed, 4 Mar 2020 10:47:12 +0000 Subject: [PATCH 12/32] Addressing latest review comments --- .../elasticsearch/xpack/core/XPackField.java | 2 - .../xpack/wildcard/Wildcard.java | 16 +--- .../wildcard/WildcardInfoTransportAction.java | 43 ----------- .../WildcardUsageTransportAction.java | 74 ------------------- .../wildcard/mapper/WildcardFieldMapper.java | 35 ++++++++- .../mapper/WildcardOnBinaryDvQuery.java | 14 ++-- .../WildcardInfoTransportActionTests.java | 53 ------------- .../mapper/WildcardFieldTypeTests.java | 13 ++++ 8 files changed, 51 insertions(+), 199 deletions(-) delete mode 100644 x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/WildcardInfoTransportAction.java delete mode 100644 x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/WildcardUsageTransportAction.java delete mode 100644 x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/WildcardInfoTransportActionTests.java diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackField.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackField.java index e5caa3f4322be..3bc1a44e7b820 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackField.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackField.java @@ -44,8 +44,6 @@ public final class XPackField { /** Name constant for flattened fields. */ /** Name constant for the vectors feature. */ public static final String VECTORS = "vectors"; - /** Name constant for the wildcard feature. */ - public static final String WILDCARD = "wildcard"; /** Name constant for the voting-only-node feature. */ public static final String VOTING_ONLY = "voting_only"; /** Name constant for the frozen index feature. */ diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/Wildcard.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/Wildcard.java index 4141c96261c42..3749dc2622c8b 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/Wildcard.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/Wildcard.java @@ -6,36 +6,22 @@ package org.elasticsearch.xpack.wildcard; -import org.elasticsearch.action.ActionRequest; -import org.elasticsearch.action.ActionResponse; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.mapper.Mapper; -import org.elasticsearch.plugins.ActionPlugin; import org.elasticsearch.plugins.MapperPlugin; import org.elasticsearch.plugins.Plugin; -import org.elasticsearch.xpack.core.action.XPackInfoFeatureAction; -import org.elasticsearch.xpack.core.action.XPackUsageFeatureAction; import org.elasticsearch.xpack.wildcard.mapper.WildcardFieldMapper; -import java.util.Arrays; import java.util.Collections; import java.util.LinkedHashMap; -import java.util.List; import java.util.Map; -public class Wildcard extends Plugin implements MapperPlugin, ActionPlugin { +public class Wildcard extends Plugin implements MapperPlugin { public Wildcard(Settings settings) { } - @Override - public List> getActions() { - return Arrays.asList( - new ActionPlugin.ActionHandler<>(XPackUsageFeatureAction.WILDCARD, WildcardUsageTransportAction.class), - new ActionPlugin.ActionHandler<>(XPackInfoFeatureAction.WILDCARD, WildcardInfoTransportAction.class)); - } - @Override public Map getMappers() { Map mappers = new LinkedHashMap<>(); diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/WildcardInfoTransportAction.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/WildcardInfoTransportAction.java deleted file mode 100644 index fea55a9c0662a..0000000000000 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/WildcardInfoTransportAction.java +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License; - * you may not use this file except in compliance with the Elastic License. - */ -package org.elasticsearch.xpack.wildcard; - -import org.elasticsearch.action.support.ActionFilters; -import org.elasticsearch.common.inject.Inject; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.license.XPackLicenseState; -import org.elasticsearch.transport.TransportService; -import org.elasticsearch.xpack.core.XPackField; -import org.elasticsearch.xpack.core.action.XPackInfoFeatureAction; -import org.elasticsearch.xpack.core.action.XPackInfoFeatureTransportAction; - -public class WildcardInfoTransportAction extends XPackInfoFeatureTransportAction { - - private final XPackLicenseState licenseState; - - @Inject - public WildcardInfoTransportAction(TransportService transportService, ActionFilters actionFilters, - Settings settings, XPackLicenseState licenseState) { - super(XPackInfoFeatureAction.WILDCARD.name(), transportService, actionFilters); - this.licenseState = licenseState; - } - - @Override - public String name() { - return XPackField.WILDCARD; - } - - @Override - public boolean available() { - return licenseState.isWildcardAllowed(); - } - - @Override - public boolean enabled() { - return true; - } - -} diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/WildcardUsageTransportAction.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/WildcardUsageTransportAction.java deleted file mode 100644 index 96bb8452034ee..0000000000000 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/WildcardUsageTransportAction.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License; - * you may not use this file except in compliance with the Elastic License. - */ -package org.elasticsearch.xpack.wildcard; - -import org.elasticsearch.action.ActionListener; -import org.elasticsearch.action.support.ActionFilters; -import org.elasticsearch.cluster.ClusterState; -import org.elasticsearch.cluster.metadata.IndexMetaData; -import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver; -import org.elasticsearch.cluster.metadata.MappingMetaData; -import org.elasticsearch.cluster.service.ClusterService; -import org.elasticsearch.common.inject.Inject; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.license.XPackLicenseState; -import org.elasticsearch.protocol.xpack.XPackUsageRequest; -import org.elasticsearch.tasks.Task; -import org.elasticsearch.threadpool.ThreadPool; -import org.elasticsearch.transport.TransportService; -import org.elasticsearch.xpack.core.action.XPackUsageFeatureAction; -import org.elasticsearch.xpack.core.action.XPackUsageFeatureResponse; -import org.elasticsearch.xpack.core.action.XPackUsageFeatureTransportAction; -import org.elasticsearch.xpack.core.wildcard.WildcardFeatureSetUsage; -import org.elasticsearch.xpack.wildcard.mapper.WildcardFieldMapper; - -import java.util.Map; - -public class WildcardUsageTransportAction extends XPackUsageFeatureTransportAction { - - private final Settings settings; - private final XPackLicenseState licenseState; - - @Inject - public WildcardUsageTransportAction(TransportService transportService, ClusterService clusterService, ThreadPool threadPool, - ActionFilters actionFilters, IndexNameExpressionResolver indexNameExpressionResolver, - Settings settings, XPackLicenseState licenseState) { - super(XPackUsageFeatureAction.WILDCARD.name(), transportService, clusterService, - threadPool, actionFilters, indexNameExpressionResolver); - this.settings = settings; - this.licenseState = licenseState; - } - - @Override - protected void masterOperation(Task task, XPackUsageRequest request, ClusterState state, - ActionListener listener) { - boolean wildcardAvailable = licenseState.isWildcardAllowed(); - int numWildcardFields = 0; - - if (wildcardAvailable && state != null) { - for (IndexMetaData indexMetaData : state.metaData()) { - MappingMetaData mappingMetaData = indexMetaData.mapping(); - if (mappingMetaData != null) { - Map mappings = mappingMetaData.getSourceAsMap(); - if (mappings.containsKey("properties")) { - @SuppressWarnings("unchecked") Map> fieldMappings = - (Map>) mappings.get("properties"); - for (Map typeDefinition : fieldMappings.values()) { - String fieldType = (String) typeDefinition.get("type"); - if (fieldType != null) { - if (fieldType.equals(WildcardFieldMapper.CONTENT_TYPE)) { - numWildcardFields++; - } - } - } - } - } - } - } - WildcardFeatureSetUsage usage = new WildcardFeatureSetUsage(wildcardAvailable, numWildcardFields); - listener.onResponse(new XPackUsageFeatureResponse(usage)); - } -} diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index 98dcd63b161c8..35738f3382bd3 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -28,6 +28,7 @@ import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.Automaton; import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.common.lucene.BytesRefs; import org.elasticsearch.common.lucene.Lucene; @@ -50,6 +51,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Objects; import static org.elasticsearch.index.mapper.TypeParsers.parseField; @@ -85,7 +87,7 @@ public Builder(String name) { @Override public Builder docValues(boolean docValues) { - if(docValues == false) { + if (docValues == false) { throw new MapperParsingException("The field [" + name + "] cannot have doc values = false"); } @@ -94,7 +96,7 @@ public Builder docValues(boolean docValues) { @Override public Builder index(boolean index) { - if(index == false) { + if (index == false) { throw new MapperParsingException("The field [" + name + "] cannot have index = false"); } @@ -186,9 +188,22 @@ protected WildcardFieldType(WildcardFieldType ref) { } public WildcardFieldType clone() { - return new WildcardFieldType(this); + WildcardFieldType result = new WildcardFieldType(this); + result.setNumChars(numChars); + return result; } + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), numChars); + } + + @Override + public boolean equals(Object o) { + if (!super.equals(o)) return false; + WildcardFieldType that = (WildcardFieldType) o; + return numChars == that.numChars; + } // Holds parsed information about the wildcard pattern static class PatternStructure { @@ -385,7 +400,8 @@ public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QuerySh if (approximation.clauses().size() > 1 || patternStructure.needsVerification()) { BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder(); verifyingBuilder.add(new BooleanClause(approximation, Occur.MUST)); - verifyingBuilder.add(new BooleanClause(new WildcardOnBinaryDvQuery(name(), wildcardPattern), Occur.MUST)); + Automaton automaton = WildcardQuery.toAutomaton(new Term(name(), wildcardPattern)); + verifyingBuilder.add(new BooleanClause(new WildcardOnBinaryDvQuery(name(), wildcardPattern, automaton), Occur.MUST)); return verifyingBuilder.build(); } return approximation; @@ -396,8 +412,19 @@ int numChars() { } void setNumChars(int numChars) { + checkIfFrozen(); this.numChars = numChars; } + + @Override + public void checkCompatibility(MappedFieldType fieldType, List conflicts) { + super.checkCompatibility(fieldType, conflicts); + WildcardFieldType other = (WildcardFieldType)fieldType; + // prevent user from changing num_chars + if (numChars() != other.numChars()) { + conflicts.add("mapper [" + name() + "] has different [num_chars]"); + } + } @Override public String typeName() { diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardOnBinaryDvQuery.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardOnBinaryDvQuery.java index f058f8b0fada9..66f9e9159d5fa 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardOnBinaryDvQuery.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardOnBinaryDvQuery.java @@ -34,16 +34,17 @@ public class WildcardOnBinaryDvQuery extends Query { private final String field; private final String wildcardPattern; + private Automaton automaton; - public WildcardOnBinaryDvQuery(String field, String wildcardPattern) { + public WildcardOnBinaryDvQuery(String field, String wildcardPattern, Automaton automaton) { this.field = field; this.wildcardPattern = wildcardPattern; + this.automaton = automaton; } @Override public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { - - Automaton automaton = WildcardQuery.toAutomaton(new Term(field,wildcardPattern)); + ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton); return new ConstantScoreWeight(this, boost) { @@ -54,11 +55,8 @@ public Scorer scorer(LeafReaderContext context) throws IOException { TwoPhaseIterator twoPhase = new TwoPhaseIterator(values) { @Override public boolean matches() throws IOException { - if (values.advanceExact(approximation.docID())) { - BytesRef value = values.binaryValue(); - return bytesMatcher.run(value.bytes, value.offset, value.length); - } - return false; + BytesRef value = values.binaryValue(); + return bytesMatcher.run(value.bytes, value.offset, value.length); } @Override diff --git a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/WildcardInfoTransportActionTests.java b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/WildcardInfoTransportActionTests.java deleted file mode 100644 index 791afeb57c090..0000000000000 --- a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/WildcardInfoTransportActionTests.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License; - * you may not use this file except in compliance with the Elastic License. - */ -package org.elasticsearch.xpack.wildcard; - -import org.elasticsearch.action.support.ActionFilters; -import org.elasticsearch.action.support.PlainActionFuture; -import org.elasticsearch.common.io.stream.BytesStreamOutput; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.license.XPackLicenseState; -import org.elasticsearch.test.ESTestCase; -import org.elasticsearch.transport.TransportService; -import org.elasticsearch.xpack.core.XPackFeatureSet; -import org.elasticsearch.xpack.core.action.XPackUsageFeatureResponse; -import org.elasticsearch.xpack.core.wildcard.WildcardFeatureSetUsage; -import org.junit.Before; - -import static org.hamcrest.core.Is.is; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - -public class WildcardInfoTransportActionTests extends ESTestCase { - - private XPackLicenseState licenseState; - - @Before - public void init() { - licenseState = mock(XPackLicenseState.class); - } - - public void testAvailable() throws Exception { - WildcardInfoTransportAction featureSet = new WildcardInfoTransportAction( - mock(TransportService.class), mock(ActionFilters.class), Settings.EMPTY, licenseState); - boolean available = randomBoolean(); - when(licenseState.isWildcardAllowed()).thenReturn(available); - assertThat(featureSet.available(), is(available)); - - var usageAction = new WildcardUsageTransportAction(mock(TransportService.class), null, null, - mock(ActionFilters.class), null, Settings.EMPTY, licenseState); - PlainActionFuture future = new PlainActionFuture<>(); - usageAction.masterOperation(null, null, null, future); - XPackFeatureSet.Usage usage = future.get().getUsage(); - assertThat(usage.available(), is(available)); - - BytesStreamOutput out = new BytesStreamOutput(); - usage.writeTo(out); - XPackFeatureSet.Usage serializedUsage = new WildcardFeatureSetUsage(out.bytes().streamInput()); - assertThat(serializedUsage.available(), is(available)); - } - -} diff --git a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldTypeTests.java b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldTypeTests.java index ff5c19ed2df6c..b3c8f49f5343f 100644 --- a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldTypeTests.java +++ b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldTypeTests.java @@ -9,6 +9,8 @@ import org.elasticsearch.index.mapper.FieldTypeTestCase; import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.xpack.wildcard.mapper.WildcardFieldMapper.WildcardFieldType; +import org.junit.Before; public class WildcardFieldTypeTests extends FieldTypeTestCase { @@ -16,4 +18,15 @@ public class WildcardFieldTypeTests extends FieldTypeTestCase { protected MappedFieldType createDefaultFieldType() { return new WildcardFieldMapper.WildcardFieldType(); } + + @Before + public void setupProperties() { + addModifier(new Modifier("num_chars", false) { + @Override + public void modify(MappedFieldType ft) { + WildcardFieldType fieldType = (WildcardFieldType) ft; + fieldType.setNumChars(5); + } + }); + } } From 0714c1386ab135274f2f5467733ec15f6362b802 Mon Sep 17 00:00:00 2001 From: markharwood Date: Wed, 4 Mar 2020 11:17:07 +0000 Subject: [PATCH 13/32] Renamed field from `wildcard_keyword ` to `wildcard` --- docs/reference/mapping/types.asciidoc | 2 +- ...ard-keyword.asciidoc => wildcard.asciidoc} | 16 ++--- .../core/action/XPackInfoFeatureAction.java | 1 - .../core/action/XPackUsageFeatureAction.java | 1 - .../wildcard/WildcardFeatureSetUsage.java | 60 ------------------- .../test/wildcard/10_wildcard_basic.yml | 16 ++--- .../wildcard/mapper/WildcardFieldMapper.java | 4 +- 7 files changed, 19 insertions(+), 81 deletions(-) rename docs/reference/mapping/types/{wildcard-keyword.asciidoc => wildcard.asciidoc} (64%) delete mode 100644 x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/wildcard/WildcardFeatureSetUsage.java diff --git a/docs/reference/mapping/types.asciidoc b/docs/reference/mapping/types.asciidoc index 2d10e8d92cc3e..aa767d76f7ab0 100644 --- a/docs/reference/mapping/types.asciidoc +++ b/docs/reference/mapping/types.asciidoc @@ -132,4 +132,4 @@ include::types/shape.asciidoc[] include::types/constant-keyword.asciidoc[] -include::types/wildcard-keyword.asciidoc[] +include::types/wildcard.asciidoc[] diff --git a/docs/reference/mapping/types/wildcard-keyword.asciidoc b/docs/reference/mapping/types/wildcard.asciidoc similarity index 64% rename from docs/reference/mapping/types/wildcard-keyword.asciidoc rename to docs/reference/mapping/types/wildcard.asciidoc index d965466d4ca37..88a08f2080bd8 100644 --- a/docs/reference/mapping/types/wildcard-keyword.asciidoc +++ b/docs/reference/mapping/types/wildcard.asciidoc @@ -1,17 +1,17 @@ [role="xpack"] [testenv="basic"] -[[wildcard-keyword]] -=== Wildcard keyword datatype +[[wildcard]] +=== Wildcard datatype ++++ -Wildcard keyword +Wildcard ++++ -A `wildcard_keyword` field stores values optimised for wildcard queries. +A `wildcard` field stores values optimised for wildcard queries. Wildcard queries are possible on other field types but suffer from constraints: * `text` fields limit matching of any wildcard expressions to individual tokens rather than the original whole value held in a field * `keyword` fields are untokenized but slow at performing wildcard queries (especially patterns with leading wildcards). -Internally the `wildcard_keyword` field indexes the whole field value using ngrams and stores the full string in compressed chunks. +Internally the `wildcard` field indexes the whole field value using ngrams and stores the full string in compressed chunks. The index is used as a rough filter to cut down the number of values that are then checked by retrieving and checking the full values from the compressed store. Storage costs are typically lower than those of `keyword` fields @@ -24,7 +24,7 @@ PUT my_index "mappings": { "properties": { "my_wildcard": { - "type": "wildcard_keyword" + "type": "wildcard" } } } @@ -48,6 +48,6 @@ POST my_index/_doc/_search ==== Limitations -* `wildcard_keyword` fields are untokenized like keyword fields, so do not support queries that rely on word positions such as phrase queries. -* `wildcard_keyword` fields cannot be used as a value source in aggregations such as the `terms` aggregation. +* `wildcard` fields are untokenized like keyword fields, so do not support queries that rely on word positions such as phrase queries. +* `wildcard` fields cannot be used as a value source in aggregations such as the `terms` aggregation. diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/action/XPackInfoFeatureAction.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/action/XPackInfoFeatureAction.java index dae576f3eaef8..0d97119434cc3 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/action/XPackInfoFeatureAction.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/action/XPackInfoFeatureAction.java @@ -36,7 +36,6 @@ public class XPackInfoFeatureAction extends ActionType public static final XPackInfoFeatureAction CCR = new XPackInfoFeatureAction(XPackField.CCR); public static final XPackInfoFeatureAction TRANSFORM = new XPackInfoFeatureAction(XPackField.TRANSFORM); public static final XPackInfoFeatureAction VECTORS = new XPackInfoFeatureAction(XPackField.VECTORS); - public static final XPackInfoFeatureAction WILDCARD = new XPackInfoFeatureAction(XPackField.WILDCARD); public static final XPackInfoFeatureAction VOTING_ONLY = new XPackInfoFeatureAction(XPackField.VOTING_ONLY); public static final XPackInfoFeatureAction FROZEN_INDICES = new XPackInfoFeatureAction(XPackField.FROZEN_INDICES); public static final XPackInfoFeatureAction SPATIAL = new XPackInfoFeatureAction(XPackField.SPATIAL); diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/action/XPackUsageFeatureAction.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/action/XPackUsageFeatureAction.java index afa3abe7fbb20..c696fdeaa3e29 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/action/XPackUsageFeatureAction.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/action/XPackUsageFeatureAction.java @@ -36,7 +36,6 @@ public class XPackUsageFeatureAction extends ActionType Date: Wed, 4 Mar 2020 11:56:44 +0000 Subject: [PATCH 14/32] Added REST tests for sorting and aggs --- .../test/wildcard/10_wildcard_basic.yml | 55 ++++++++++++++++++- .../mapper/WildcardOnBinaryDvQuery.java | 2 - 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml index 45327b2607f82..c6ef7f969ba4d 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml +++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml @@ -20,6 +20,12 @@ setup: id: 1 body: my_wildcard: hello world + - do: + index: + index: test-index + id: 2 + body: + my_wildcard: goodbye world - do: indices.refresh: {} @@ -97,7 +103,7 @@ setup: my_wildcard: {value: "*ld" } - - match: {hits.total: 1} + - match: {hits.total: 2} --- "Long suffix query": @@ -174,4 +180,51 @@ setup: - match: {hits.total: 0} +--- +"Aggs work": + - do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + body: + query: + wildcard: + my_wildcard: {value: "*world*" } + aggs: + top_vals: + terms: {field: "my_wildcard" } + + + - match: {hits.total: 2} + - length: { aggregations.top_vals.buckets: 2 } + +--- +"Sort works": + - do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + body: + sort: [ { "my_wildcard": "desc" } ] + + - match: { hits.total: 2 } + - length: { hits.hits: 2 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "2" } + + - do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + body: + sort: [ { "my_wildcard": "asc" } ] + + - match: { hits.total: 2 } + - length: { hits.hits: 2 } + - match: { hits.hits.0._id: "2" } + - match: { hits.hits.1._id: "1" } + diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardOnBinaryDvQuery.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardOnBinaryDvQuery.java index 66f9e9159d5fa..a37b193da116e 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardOnBinaryDvQuery.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardOnBinaryDvQuery.java @@ -9,7 +9,6 @@ import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.Term; import org.apache.lucene.search.ConstantScoreScorer; import org.apache.lucene.search.ConstantScoreWeight; import org.apache.lucene.search.IndexSearcher; @@ -18,7 +17,6 @@ import org.apache.lucene.search.Scorer; import org.apache.lucene.search.TwoPhaseIterator; import org.apache.lucene.search.Weight; -import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.ByteRunAutomaton; From 377f81f4260c72bfbaace0aa539be9b1cd20d62c Mon Sep 17 00:00:00 2001 From: markharwood Date: Wed, 4 Mar 2020 12:04:58 +0000 Subject: [PATCH 15/32] Fix invalid docs reference --- docs/reference/mapping/types.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/mapping/types.asciidoc b/docs/reference/mapping/types.asciidoc index aa767d76f7ab0..d26d897474deb 100644 --- a/docs/reference/mapping/types.asciidoc +++ b/docs/reference/mapping/types.asciidoc @@ -7,7 +7,7 @@ document: [float] === Core datatypes -string:: <>, <> and <> +string:: <>, <> and <> <>:: `long`, `integer`, `short`, `byte`, `double`, `float`, `half_float`, `scaled_float` <>:: `date` <>:: `date_nanos` From 095d0f3ca731a20c46e0a552cc75d342e9057cf1 Mon Sep 17 00:00:00 2001 From: markharwood Date: Thu, 5 Mar 2020 10:59:50 +0000 Subject: [PATCH 16/32] Renamed WildcardOnBinaryDVQuery to AutomatonQueryOnBinaryDV. Changed TaperedNGramTokenFilter to TaperedNGramTokenizer. --- ...ery.java => AutomatonQueryOnBinaryDv.java} | 18 +++--- ...Filter.java => TaperedNgramTokenizer.java} | 61 +++++++++---------- .../wildcard/mapper/WildcardFieldMapper.java | 9 ++- ...s.java => TaperedNGramTokenizerTests.java} | 44 +++++-------- 4 files changed, 55 insertions(+), 77 deletions(-) rename x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/{WildcardOnBinaryDvQuery.java => AutomatonQueryOnBinaryDv.java} (84%) rename x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/{TaperedNgramTokenFilter.java => TaperedNgramTokenizer.java} (64%) rename x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/{TaperedNgramTokenFilterTests.java => TaperedNGramTokenizerTests.java} (51%) diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardOnBinaryDvQuery.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java similarity index 84% rename from x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardOnBinaryDvQuery.java rename to x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java index a37b193da116e..01ecf339358b1 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardOnBinaryDvQuery.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java @@ -25,18 +25,18 @@ import java.util.Objects; /** - * Query that runs a wildcard pattern across all binary doc values. + * Query that runs an Automaton across all binary doc values. * Expensive to run so normally used in conjunction with more selective query clauses. */ -public class WildcardOnBinaryDvQuery extends Query { +public class AutomatonQueryOnBinaryDv extends Query { private final String field; - private final String wildcardPattern; + private final String matchPattern; private Automaton automaton; - public WildcardOnBinaryDvQuery(String field, String wildcardPattern, Automaton automaton) { + public AutomatonQueryOnBinaryDv(String field, String matchPattern, Automaton automaton) { this.field = field; - this.wildcardPattern = wildcardPattern; + this.matchPattern = matchPattern; this.automaton = automaton; } @@ -74,18 +74,18 @@ public boolean isCacheable(LeafReaderContext ctx) { } @Override public String toString(String field) { - return field+":"+wildcardPattern; + return field+":"+matchPattern; } @Override public boolean equals(Object obj) { - WildcardOnBinaryDvQuery other = (WildcardOnBinaryDvQuery) obj; - return Objects.equals(field, other.field) && Objects.equals(wildcardPattern, other.wildcardPattern); + AutomatonQueryOnBinaryDv other = (AutomatonQueryOnBinaryDv) obj; + return Objects.equals(field, other.field) && Objects.equals(matchPattern, other.matchPattern); } @Override public int hashCode() { - return Objects.hash(field, wildcardPattern); + return Objects.hash(field, matchPattern); } } diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenFilter.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenizer.java similarity index 64% rename from x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenFilter.java rename to x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenizer.java index 8084e4b0f95ca..08e69d35b7ac9 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenFilter.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenizer.java @@ -6,19 +6,15 @@ package org.elasticsearch.xpack.wildcard.mapper; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import java.io.IOException; -/** - * Variation on Lucene's NGramTokenFilter that uses smaller (1 character) ngrams for the final few characters in a string. Helps improve - * performance of short suffix queries e.g. "*.exe" - */ -public final class TaperedNgramTokenFilter extends TokenFilter { +public class TaperedNgramTokenizer extends Tokenizer { + private final int maxGram; private char[] curTermBuffer; @@ -31,34 +27,21 @@ public final class TaperedNgramTokenFilter extends TokenFilter { private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + - /** - * Creates a TaperedNgramTokenFilter that, for a given input term, produces all contained n-grams with length = maxGram. Will generate - * small ngrams from maxGram down to 1 for the end of the input token. - * - * Note: Care must be taken when choosing maxGram; depending on the input token size, this filter potentially produces a huge number of - * unique terms in the index. - * - * @param input - * {@link TokenStream} holding the input to be tokenized - * @param maxGram - * the maximum length of the generated n-grams (apart from those at tail) - */ - public TaperedNgramTokenFilter(TokenStream input, int maxGram) { - super(input); + public TaperedNgramTokenizer(int maxGram) { if (maxGram < 1) { throw new IllegalArgumentException("maxGram must be greater than zero"); } this.maxGram = maxGram; - } - + } + @Override - public boolean incrementToken() throws IOException { + public final boolean incrementToken() throws IOException { + clearAttributes(); while (true) { - if (curTermBuffer == null) { - if (!input.incrementToken()) { - return false; - } + if (curTermBuffer == null) { + loadBufferFromReader(); state = captureState(); curTermLength = termAtt.length(); @@ -83,12 +66,23 @@ public boolean incrementToken() throws IOException { posIncrAtt.setPositionIncrement(curPosIncr); return true; } - - // Done with this input token, get next token on next iteration. - curTermBuffer = null; + return false; } } - + + void loadBufferFromReader() throws IOException { + int upto = 0; + curTermBuffer = termAtt.buffer(); + while (true) { + final int length = input.read(curTermBuffer, upto, curTermBuffer.length-upto); + if (length == -1) break; + upto += length; + if (upto == curTermBuffer.length) + curTermBuffer = termAtt.resizeBuffer(1+curTermBuffer.length); + } + termAtt.setLength(upto); + } + @Override public void reset() throws IOException { super.reset(); @@ -100,5 +94,6 @@ public void reset() throws IOException { public void end() throws IOException { super.end(); posIncrAtt.setPositionIncrement(curPosIncr); - } + } + } diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index 606032f6d3412..fd9e63b3907c7 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -401,7 +401,7 @@ public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QuerySh BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder(); verifyingBuilder.add(new BooleanClause(approximation, Occur.MUST)); Automaton automaton = WildcardQuery.toAutomaton(new Term(name(), wildcardPattern)); - verifyingBuilder.add(new BooleanClause(new WildcardOnBinaryDvQuery(name(), wildcardPattern, automaton), Occur.MUST)); + verifyingBuilder.add(new BooleanClause(new AutomatonQueryOnBinaryDv(name(), wildcardPattern, automaton), Occur.MUST)); return verifyingBuilder.build(); } return approximation; @@ -516,11 +516,10 @@ void createFields(String value, Listfields) { if (value == null || value.length() > ignoreAbove) { return; } - KeywordTokenizer kt = new KeywordTokenizer(256); - kt.setReader(new StringReader(TOKEN_START_OR_END_CHAR+ value +TOKEN_START_OR_END_CHAR)); - TokenFilter filter = new TaperedNgramTokenFilter(kt, fieldType().numChars); + TaperedNgramTokenizer tokenizer = new TaperedNgramTokenizer(fieldType().numChars); + tokenizer.setReader(new StringReader(TOKEN_START_OR_END_CHAR+ value +TOKEN_START_OR_END_CHAR)); - Field field = new Field(fieldType().name(), filter, fieldType()); + Field field = new Field(fieldType().name(), tokenizer, fieldType()); fields.add(field); Field dvField = new BinaryDocValuesField(fieldType().name(), new BytesRef(value)); diff --git a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenFilterTests.java b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNGramTokenizerTests.java similarity index 51% rename from x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenFilterTests.java rename to x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNGramTokenizerTests.java index 345c5b3aabb72..b04985ea27473 100644 --- a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenFilterTests.java +++ b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNGramTokenizerTests.java @@ -7,37 +7,34 @@ package org.elasticsearch.xpack.wildcard.mapper; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.core.KeywordTokenizer; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.elasticsearch.ElasticsearchParseException; -import org.elasticsearch.test.ESTestCase; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; import java.io.IOException; import java.io.StringReader; +import java.util.Arrays; import static org.hamcrest.Matchers.equalTo; -public class TaperedNgramTokenFilterTests extends ESTestCase { +public class TaperedNGramTokenizerTests extends BaseTokenStreamTestCase { - public void testLongString() { + public void testLongString() throws IOException { checkTokens("Hello world", 6, "Hello ", "ello w", "llo wo", "lo wor", "o worl", " world", "world", "orld", "rld", "ld", "d"); } - public void testShortString() { + public void testShortString() throws IOException { checkTokens("Hello", 5, "Hello", "ello", "llo", "lo", "o"); } - public void testSingleCharDoc() { + public void testSingleCharDoc() throws IOException { checkTokens("H", 5, "H"); } - public void testSingleCharNgram() { + public void testSingleCharNgram() throws IOException { checkTokens("Hello", 1, "H", "e", "l", "l", "o"); } - public void testFieldMapperEncoding() { + public void testFieldMapperEncoding() throws IOException { char TOKEN_START_OR_END_CHAR = 0; checkTokens(TOKEN_START_OR_END_CHAR+"aaa"+TOKEN_START_OR_END_CHAR, 5, TOKEN_START_OR_END_CHAR+"aaa"+TOKEN_START_OR_END_CHAR, @@ -53,25 +50,12 @@ public void testTooShortNgram() { } - private static void checkTokens(String value, int ngramLength, String... expectedTokens) { - KeywordTokenizer kt = new KeywordTokenizer(256); - kt.setReader(new StringReader(value)); - TokenFilter filter = new TaperedNgramTokenFilter(kt, ngramLength); - CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class); - int tokPos = 0; - try { - filter.reset(); - while (filter.incrementToken()) { - String expectedToken = expectedTokens[tokPos++]; - String actualToken = termAtt.toString(); - assertEquals(expectedToken, actualToken); - } - kt.end(); - kt.close(); - } catch (IOException ioe) { - throw new ElasticsearchParseException("Error parsing wildcard query pattern fragment [" + value + "]"); - } - assertEquals(expectedTokens.length, tokPos); + private static void checkTokens(String value, int ngramLength, String... expectedTokens) throws IOException { + TaperedNgramTokenizer tokenizer = new TaperedNgramTokenizer(ngramLength); + tokenizer.setReader(new StringReader(value)); + int [] posIncs = new int[expectedTokens.length]; + Arrays.fill(posIncs, 1); + assertTokenStreamContents(tokenizer, expectedTokens, posIncs); } } From c41f208cf5c70ec8ec946d7859f05eea7a848f6b Mon Sep 17 00:00:00 2001 From: markharwood Date: Thu, 5 Mar 2020 11:41:56 +0000 Subject: [PATCH 17/32] Removed outdated limitation from docs --- docs/reference/mapping/types/wildcard.asciidoc | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/reference/mapping/types/wildcard.asciidoc b/docs/reference/mapping/types/wildcard.asciidoc index 88a08f2080bd8..10cdcfbeb89f6 100644 --- a/docs/reference/mapping/types/wildcard.asciidoc +++ b/docs/reference/mapping/types/wildcard.asciidoc @@ -49,5 +49,4 @@ POST my_index/_doc/_search ==== Limitations * `wildcard` fields are untokenized like keyword fields, so do not support queries that rely on word positions such as phrase queries. -* `wildcard` fields cannot be used as a value source in aggregations such as the `terms` aggregation. From 39f248f0b984bf027e529ab2865f5be34e187716 Mon Sep 17 00:00:00 2001 From: markharwood Date: Thu, 5 Mar 2020 17:32:59 +0000 Subject: [PATCH 18/32] =?UTF-8?q?Addressed=20latest=20review=20comments=20?= =?UTF-8?q?apart=20from=20support=20for=20arrays.=20That=E2=80=99ll=20come?= =?UTF-8?q?=20in=20another=20commit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../reference/mapping/types/wildcard.asciidoc | 9 ++- .../plain/BinaryDVIndexFieldData.java | 2 +- .../test/wildcard/10_wildcard_basic.yml | 80 +++++++------------ .../mapper/AutomatonQueryOnBinaryDv.java | 2 +- .../mapper/TaperedNgramTokenizer.java | 9 +++ .../wildcard/mapper/WildcardFieldMapper.java | 12 +-- 6 files changed, 49 insertions(+), 65 deletions(-) diff --git a/docs/reference/mapping/types/wildcard.asciidoc b/docs/reference/mapping/types/wildcard.asciidoc index 10cdcfbeb89f6..51d10ff53ca92 100644 --- a/docs/reference/mapping/types/wildcard.asciidoc +++ b/docs/reference/mapping/types/wildcard.asciidoc @@ -6,14 +6,15 @@ Wildcard ++++ -A `wildcard` field stores values optimised for wildcard queries. +A `wildcard` field stores values optimised for wildcard grep-like queries. Wildcard queries are possible on other field types but suffer from constraints: * `text` fields limit matching of any wildcard expressions to individual tokens rather than the original whole value held in a field * `keyword` fields are untokenized but slow at performing wildcard queries (especially patterns with leading wildcards). -Internally the `wildcard` field indexes the whole field value using ngrams and stores the full string in compressed chunks. -The index is used as a rough filter to cut down the number of values that are then checked by retrieving and checking the full values from the compressed store. -Storage costs are typically lower than those of `keyword` fields +Internally the `wildcard` field indexes the whole field value using ngrams and stores the full string. +The index is used as a rough filter to cut down the number of values that are then checked by retrieving and checking the full values. +This field is especially well suited to run grep-like queries on log lines. Storage costs are typically lower than those of `keyword` +fields but search speeds for exact matches on full terms are slower. You index and search a wildcard field as follows diff --git a/server/src/main/java/org/elasticsearch/index/fielddata/plain/BinaryDVIndexFieldData.java b/server/src/main/java/org/elasticsearch/index/fielddata/plain/BinaryDVIndexFieldData.java index d9bd12f19522c..071bb3be8474d 100644 --- a/server/src/main/java/org/elasticsearch/index/fielddata/plain/BinaryDVIndexFieldData.java +++ b/server/src/main/java/org/elasticsearch/index/fielddata/plain/BinaryDVIndexFieldData.java @@ -53,7 +53,7 @@ public SortField sortField(@Nullable Object missingValue, MultiValueMode sortMod boolean reverse) { XFieldComparatorSource source = new BytesRefFieldComparatorSource(this, missingValue, sortMode, nested); - return new SortField(getFieldName(), source, reverse); + return new SortField(getFieldName(), source, reverse); } @Override diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml index c6ef7f969ba4d..5de3d824c6ac2 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml +++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml @@ -33,161 +33,139 @@ setup: --- "Short prefix query": - do: - headers: - Content-Type: application/json search: - rest_total_hits_as_int: true body: + track_total_hits: true query: wildcard: my_wildcard: {value: "hel*" } - - match: {hits.total: 1} + - match: {hits.total.value: 1} --- "Long prefix query": - do: - headers: - Content-Type: application/json search: - rest_total_hits_as_int: true body: + track_total_hits: true query: wildcard: my_wildcard: {value: "hello wor*" } - - match: {hits.total: 1} + - match: {hits.total.value: 1} --- "Short unrooted query": - do: - headers: - Content-Type: application/json search: - rest_total_hits_as_int: true body: + track_total_hits: true query: wildcard: my_wildcard: {value: "*ello*" } - - match: {hits.total: 1} + - match: {hits.total.value: 1} --- "Long unrooted query": - do: - headers: - Content-Type: application/json search: - rest_total_hits_as_int: true body: + track_total_hits: true query: wildcard: my_wildcard: {value: "*ello worl*" } - - match: {hits.total: 1} + - match: {hits.total.value: 1} --- "Short suffix query": - do: - headers: - Content-Type: application/json search: - rest_total_hits_as_int: true body: + track_total_hits: true query: wildcard: my_wildcard: {value: "*ld" } - - match: {hits.total: 2} + - match: {hits.total.value: 2} --- "Long suffix query": - do: - headers: - Content-Type: application/json search: - rest_total_hits_as_int: true body: + track_total_hits: true query: wildcard: my_wildcard: {value: "*ello world" } - - match: {hits.total: 1} + - match: {hits.total.value: 1} --- "No wildcard wildcard query": - do: - headers: - Content-Type: application/json search: - rest_total_hits_as_int: true body: + track_total_hits: true query: wildcard: my_wildcard: {value: "hello world" } - - match: {hits.total: 1} + - match: {hits.total.value: 1} --- "Term query on wildcard field": - do: - headers: - Content-Type: application/json search: - rest_total_hits_as_int: true body: + track_total_hits: true query: term: my_wildcard: "hello world" - - match: {hits.total: 1} + - match: {hits.total.value: 1} --- "Terms query on wildcard field": - do: - headers: - Content-Type: application/json search: - rest_total_hits_as_int: true body: + track_total_hits: true query: terms: my_wildcard: ["hello world", "does not exist"] - - match: {hits.total: 1} + - match: {hits.total.value: 1} --- "Sequence fail": - do: - headers: - Content-Type: application/json search: - rest_total_hits_as_int: true body: + track_total_hits: true query: wildcard: my_wildcard: {value: "*world*hello*" } - - match: {hits.total: 0} + - match: {hits.total.value: 0} --- "Aggs work": - do: - headers: - Content-Type: application/json search: - rest_total_hits_as_int: true body: + track_total_hits: true query: wildcard: my_wildcard: {value: "*world*" } @@ -196,33 +174,29 @@ setup: terms: {field: "my_wildcard" } - - match: {hits.total: 2} + - match: {hits.total.value: 2} - length: { aggregations.top_vals.buckets: 2 } --- "Sort works": - do: - headers: - Content-Type: application/json search: - rest_total_hits_as_int: true body: + track_total_hits: true sort: [ { "my_wildcard": "desc" } ] - - match: { hits.total: 2 } - - length: { hits.hits: 2 } + - match: { hits.total.value: 2 } + - length: { hits.hits.value: 2 } - match: { hits.hits.0._id: "1" } - match: { hits.hits.1._id: "2" } - do: - headers: - Content-Type: application/json search: - rest_total_hits_as_int: true body: + track_total_hits: true sort: [ { "my_wildcard": "asc" } ] - - match: { hits.total: 2 } + - match: { hits.total.value: 2 } - length: { hits.hits: 2 } - match: { hits.hits.0._id: "2" } - match: { hits.hits.1._id: "1" } diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java index 01ecf339358b1..ef3fe69c8aa47 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java @@ -54,7 +54,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException { @Override public boolean matches() throws IOException { BytesRef value = values.binaryValue(); - return bytesMatcher.run(value.bytes, value.offset, value.length); + return bytesMatcher.run(value.bytes, value.offset, value.length); } @Override diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenizer.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenizer.java index 08e69d35b7ac9..e294a1b0a7933 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenizer.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenizer.java @@ -9,9 +9,12 @@ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.index.IndexWriter; import java.io.IOException; +import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT; + public class TaperedNgramTokenizer extends Tokenizer { @@ -28,12 +31,15 @@ public class TaperedNgramTokenizer extends Tokenizer { private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + /** Default read buffer size */ + public static final int DEFAULT_BUFFER_SIZE = 256; public TaperedNgramTokenizer(int maxGram) { if (maxGram < 1) { throw new IllegalArgumentException("maxGram must be greater than zero"); } this.maxGram = maxGram; + termAtt.resizeBuffer(DEFAULT_BUFFER_SIZE); } @Override @@ -77,6 +83,9 @@ void loadBufferFromReader() throws IOException { final int length = input.read(curTermBuffer, upto, curTermBuffer.length-upto); if (length == -1) break; upto += length; + if (upto > IndexWriter.MAX_TERM_LENGTH) { + throw new IllegalArgumentException("Provided value longer than Lucene maximum term length of " + IndexWriter.MAX_TERM_LENGTH ); + } if (upto == curTermBuffer.length) curTermBuffer = termAtt.resizeBuffer(1+curTermBuffer.length); } diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index fd9e63b3907c7..6a23c18b745da 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -74,10 +74,11 @@ public static class Defaults { FIELD_TYPE.freeze(); } public static final int IGNORE_ABOVE = Integer.MAX_VALUE; + public static final int NUM_CHARS = 3; } public static class Builder extends FieldMapper.Builder { - private int numChars = 3; + private int numChars = Defaults.NUM_CHARS; protected int ignoreAbove = Defaults.IGNORE_ABOVE; public Builder(String name) { @@ -459,13 +460,11 @@ public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) { } private int ignoreAbove; - private int numChars; private WildcardFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType, int ignoreAbove, int numChars, Settings indexSettings, MultiFields multiFields, CopyTo copyTo) { super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo); this.ignoreAbove = ignoreAbove; - this.numChars = numChars; assert fieldType.indexOptions() == IndexOptions.DOCS; } @@ -490,7 +489,9 @@ public WildcardFieldType fieldType() { @Override protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, Params params) throws IOException { super.doXContentBody(builder, includeDefaults, params); - builder.field("num_chars", fieldType().numChars()); + if (includeDefaults || fieldType().numChars() != Defaults.NUM_CHARS) { + builder.field("num_chars", fieldType().numChars()); + } if (includeDefaults || ignoreAbove != Defaults.IGNORE_ABOVE) { builder.field("ignore_above", ignoreAbove); } @@ -517,7 +518,7 @@ void createFields(String value, Listfields) { return; } TaperedNgramTokenizer tokenizer = new TaperedNgramTokenizer(fieldType().numChars); - tokenizer.setReader(new StringReader(TOKEN_START_OR_END_CHAR+ value +TOKEN_START_OR_END_CHAR)); + tokenizer.setReader(new StringReader(TOKEN_START_OR_END_CHAR + value + TOKEN_START_OR_END_CHAR)); Field field = new Field(fieldType().name(), tokenizer, fieldType()); fields.add(field); @@ -536,6 +537,5 @@ protected String contentType() { protected void doMerge(Mapper mergeWith) { super.doMerge(mergeWith); this.ignoreAbove = ((WildcardFieldMapper) mergeWith).ignoreAbove; - this.numChars = ((WildcardFieldMapper) mergeWith).numChars; } } From beabe03a2f2aca6e169c21aa09a3ef385f41ccc3 Mon Sep 17 00:00:00 2001 From: markharwood Date: Thu, 5 Mar 2020 18:08:09 +0000 Subject: [PATCH 19/32] Unused import --- .../xpack/wildcard/mapper/TaperedNgramTokenizer.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenizer.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenizer.java index e294a1b0a7933..4fd591ee96f33 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenizer.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenizer.java @@ -13,8 +13,6 @@ import java.io.IOException; -import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT; - public class TaperedNgramTokenizer extends Tokenizer { From e6bd8b0a7181e80d647b55c2260412bf5cab6b0d Mon Sep 17 00:00:00 2001 From: markharwood Date: Thu, 5 Mar 2020 18:16:38 +0000 Subject: [PATCH 20/32] Dammit. Line length --- .../xpack/wildcard/mapper/TaperedNgramTokenizer.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenizer.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenizer.java index 4fd591ee96f33..98662c16597a5 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenizer.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenizer.java @@ -82,7 +82,8 @@ void loadBufferFromReader() throws IOException { if (length == -1) break; upto += length; if (upto > IndexWriter.MAX_TERM_LENGTH) { - throw new IllegalArgumentException("Provided value longer than Lucene maximum term length of " + IndexWriter.MAX_TERM_LENGTH ); + throw new IllegalArgumentException("Provided value longer than Lucene maximum term length of " + + IndexWriter.MAX_TERM_LENGTH ); } if (upto == curTermBuffer.length) curTermBuffer = termAtt.resizeBuffer(1+curTermBuffer.length); From 488e64ae4062e783102c4346c8b380a211e51cb1 Mon Sep 17 00:00:00 2001 From: markharwood Date: Thu, 5 Mar 2020 20:10:34 +0000 Subject: [PATCH 21/32] Fix rest test bug --- .../resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml index 5de3d824c6ac2..557ad507d0e45 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml +++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml @@ -186,7 +186,7 @@ setup: sort: [ { "my_wildcard": "desc" } ] - match: { hits.total.value: 2 } - - length: { hits.hits.value: 2 } + - length: { hits.hits: 2 } - match: { hits.hits.0._id: "1" } - match: { hits.hits.1._id: "2" } From ecb021d9b6487bfe7a31db7284900efd100d479c Mon Sep 17 00:00:00 2001 From: markharwood Date: Fri, 6 Mar 2020 15:13:18 +0000 Subject: [PATCH 22/32] Add support for prefix query. Set tokenised =false on elasticsearch-facing field type. Add Lucene-facing field type for ngram index --- .../test/wildcard/10_wildcard_basic.yml | 14 ++++++++++++ .../wildcard/mapper/WildcardFieldMapper.java | 22 +++++++++++++++++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml index 557ad507d0e45..c67a79c8218da 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml +++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml @@ -147,6 +147,20 @@ setup: - match: {hits.total.value: 1} +--- +"Prefix query on wildcard field": + - do: + search: + body: + track_total_hits: true + query: + prefix: + my_wildcard: + value: "hell*" + + + - match: {hits.total.value: 1} + --- "Sequence fail": - do: diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index 6a23c18b745da..92091bef22aab 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -13,6 +13,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; @@ -67,7 +68,7 @@ public static class Defaults { public static final MappedFieldType FIELD_TYPE = new WildcardFieldType(); static { - FIELD_TYPE.setTokenized(true); + FIELD_TYPE.setTokenized(false); FIELD_TYPE.setIndexOptions(IndexOptions.DOCS); FIELD_TYPE.setStoreTermVectorOffsets(false); FIELD_TYPE.setOmitNorms(true); @@ -127,6 +128,7 @@ protected void setupFieldType(BuilderContext context) { super.setupFieldType(context); fieldType().setNumChars(numChars); fieldType().setHasDocValues(true); + fieldType().setTokenized(false); fieldType().setIndexOptions(IndexOptions.DOCS); } @@ -441,6 +443,11 @@ public Query existsQuery(QueryShardContext context) { public Query termQuery(Object value, QueryShardContext context) { return wildcardQuery(BytesRefs.toString(value), MultiTermQuery.CONSTANT_SCORE_REWRITE, context); } + + @Override + public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) { + return wildcardQuery(value + "*", method, context); + } @Override public Query termsQuery(List values, QueryShardContext context) { @@ -513,14 +520,25 @@ protected void parseCreateField(ParseContext context, List field createFields(value, fields); } + // For internal use by Lucene only - used to define ngram index + FieldType ngramFieldType = null; + void createFields(String value, Listfields) { if (value == null || value.length() > ignoreAbove) { return; } TaperedNgramTokenizer tokenizer = new TaperedNgramTokenizer(fieldType().numChars); tokenizer.setReader(new StringReader(TOKEN_START_OR_END_CHAR + value + TOKEN_START_OR_END_CHAR)); + + if (ngramFieldType == null) { + ngramFieldType = new FieldType(); + ngramFieldType.setTokenized(true); + ngramFieldType.setIndexOptions(IndexOptions.DOCS); + ngramFieldType.setOmitNorms(true); + ngramFieldType.freeze(); + } - Field field = new Field(fieldType().name(), tokenizer, fieldType()); + Field field = new Field(fieldType().name(), tokenizer, ngramFieldType); fields.add(field); Field dvField = new BinaryDocValuesField(fieldType().name(), new BytesRef(value)); From fa527dd669e62bf5571e292ab20ab806f67ff9b9 Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 9 Mar 2020 14:58:18 +0000 Subject: [PATCH 23/32] Add support for multi fields --- .../mapper/AutomatonQueryOnBinaryDv.java | 19 +++++- .../wildcard/mapper/WildcardFieldMapper.java | 63 ++++++++++++++----- .../mapper/WildcardFieldMapperTests.java | 40 +++++++++--- 3 files changed, 96 insertions(+), 26 deletions(-) diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java index ef3fe69c8aa47..e49f4a23b5140 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java @@ -17,9 +17,11 @@ import org.apache.lucene.search.Scorer; import org.apache.lucene.search.TwoPhaseIterator; import org.apache.lucene.search.Weight; +import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.ByteRunAutomaton; +import org.elasticsearch.index.mapper.BinaryFieldMapper; import java.io.IOException; import java.util.Objects; @@ -44,6 +46,8 @@ public AutomatonQueryOnBinaryDv(String field, String matchPattern, Automaton aut public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton); + ByteArrayDataInput badi = new ByteArrayDataInput(); + return new ConstantScoreWeight(this, boost) { @@ -53,8 +57,19 @@ public Scorer scorer(LeafReaderContext context) throws IOException { TwoPhaseIterator twoPhase = new TwoPhaseIterator(values) { @Override public boolean matches() throws IOException { - BytesRef value = values.binaryValue(); - return bytesMatcher.run(value.bytes, value.offset, value.length); + BytesRef arrayOfValues = values.binaryValue(); + badi.reset(arrayOfValues.bytes); + badi.setPosition(arrayOfValues.offset); + + int size = badi.readVInt(); + for (int i=0; i< size; i++) { + int valLength = badi.readVInt(); + if (bytesMatcher.run(arrayOfValues.bytes, badi.getPosition(), valLength)) { + return true; + } + badi.skipBytes(valLength); + } + return false; } @Override diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index 92091bef22aab..6918aab0bd93a 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -11,7 +11,6 @@ import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.ngram.NGramTokenFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.index.IndexOptions; @@ -26,9 +25,9 @@ import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.MultiTermQuery.RewriteMethod; import org.apache.lucene.search.Query; +import org.apache.lucene.search.SortField; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; -import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automaton; import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.common.lucene.BytesRefs; @@ -37,14 +36,24 @@ import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.common.xcontent.support.XContentMapValues; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.fielddata.IndexFieldData; -import org.elasticsearch.index.fielddata.plain.DocValuesIndexFieldData; +import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested; +import org.elasticsearch.index.fielddata.IndexFieldDataCache; +import org.elasticsearch.index.fielddata.fieldcomparator.BytesRefFieldComparatorSource; +import org.elasticsearch.index.fielddata.plain.BytesBinaryDVIndexFieldData; +import org.elasticsearch.index.mapper.BinaryFieldMapper.CustomBinaryDocValuesField; import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.Mapper; import org.elasticsearch.index.mapper.MapperParsingException; +import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.ParseContext; +import org.elasticsearch.index.mapper.ParseContext.Document; import org.elasticsearch.index.query.QueryShardContext; +import org.elasticsearch.indices.breaker.CircuitBreakerService; +import org.elasticsearch.search.MultiValueMode; import java.io.IOException; import java.io.StringReader; @@ -173,10 +182,7 @@ public static class TypeParser implements Mapper.TypeParser { } } - public static final char TOKEN_START_OR_END_CHAR = 0; - // A visible character to aid debug -// public static final char TOKEN_START_OR_END_CHAR = '$'; - + public static final char TOKEN_START_OR_END_CHAR = 0; public static final class WildcardFieldType extends MappedFieldType { private int numChars; @@ -461,9 +467,29 @@ public Query termsQuery(List values, QueryShardContext context) { @Override public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) { failIfNoDocValues(); - return new DocValuesIndexFieldData.BinaryBuilder(); + return new IndexFieldData.Builder() { + + @Override + public IndexFieldData build(IndexSettings indexSettings, MappedFieldType fieldType, IndexFieldDataCache cache, + CircuitBreakerService breakerService, MapperService mapperService) { + return new WildcardBytesBinaryDVIndexFieldData(indexSettings.getIndex(), fieldType.name()); + }}; + } + } + + static class WildcardBytesBinaryDVIndexFieldData extends BytesBinaryDVIndexFieldData{ + + public WildcardBytesBinaryDVIndexFieldData(Index index, String fieldName) { + super(index, fieldName); } - + + @Override + public SortField sortField(Object missingValue, MultiValueMode sortMode, Nested nested, boolean reverse) { + XFieldComparatorSource source = new BytesRefFieldComparatorSource(this, missingValue, + sortMode, nested); + return new SortField(getFieldName(), source, reverse); + } + } private int ignoreAbove; @@ -517,13 +543,15 @@ protected void parseCreateField(ParseContext context, List field value = parser.textOrNull(); } } - createFields(value, fields); + ParseContext.Document parseDoc = context.doc(); + + createFields(value, parseDoc, fields); } // For internal use by Lucene only - used to define ngram index FieldType ngramFieldType = null; - void createFields(String value, Listfields) { + void createFields(String value, Document parseDoc, Listfields) { if (value == null || value.length() > ignoreAbove) { return; } @@ -538,11 +566,16 @@ void createFields(String value, Listfields) { ngramFieldType.freeze(); } - Field field = new Field(fieldType().name(), tokenizer, ngramFieldType); - fields.add(field); + Field ngramField = new Field(fieldType().name(), tokenizer, ngramFieldType); + fields.add(ngramField); - Field dvField = new BinaryDocValuesField(fieldType().name(), new BytesRef(value)); - fields.add(dvField); + CustomBinaryDocValuesField dvField = (CustomBinaryDocValuesField) parseDoc.getByKey(fieldType().name()); + if (dvField == null) { + dvField = new CustomBinaryDocValuesField(fieldType().name(), value.getBytes()); + parseDoc.addWithKey(fieldType().name(), dvField); + } else { + dvField.add(value.getBytes()); + } } @Override diff --git a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java index 1e5509aa04ba7..a820c4a2ee1bb 100644 --- a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java +++ b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java @@ -39,6 +39,7 @@ import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.Mapper; import org.elasticsearch.index.mapper.MapperParsingException; +import org.elasticsearch.index.mapper.ParseContext; import org.elasticsearch.index.query.QueryShardContext; import org.elasticsearch.search.sort.FieldSortBuilder; import org.elasticsearch.test.ESTestCase; @@ -94,17 +95,19 @@ public void testTooBigKeywordField() throws IOException { IndexWriterConfig iwc = newIndexWriterConfig(); iwc.setMergePolicy(newTieredMergePolicy(random())); RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - // Create a string that is too large and will not be indexed String docContent = randomABString(MAX_FIELD_LENGTH + 1); - createDocs(docContent, iw); + Document doc = new Document(); + ParseContext.Document parseDoc = new ParseContext.Document(); + addFields(parseDoc, doc, docContent); + indexDoc(parseDoc, doc, iw); + iw.forceMerge(1); DirectoryReader reader = iw.getReader(); IndexSearcher searcher = newSearcher(reader); iw.close(); - Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery("*a*", null, null); TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.INDEXORDER); assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(0L)); @@ -123,11 +126,23 @@ public void testSearchResultsVersusKeywordField() throws IOException { int numDocs = 100; HashSet values = new HashSet<>(); for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + ParseContext.Document parseDoc = new ParseContext.Document(); String docContent = randomABString(1 + randomInt(MAX_FIELD_LENGTH - 1)); - if (values.contains(docContent) == false) { - createDocs(docContent, iw); + if (values.contains(docContent) == false) { + addFields(parseDoc, doc, docContent); values.add(docContent); } + // Occasionally add a multi-value field + if (randomBoolean()) { + docContent = randomABString(1 + randomInt(MAX_FIELD_LENGTH - 1)); + if (values.contains(docContent) == false) { + addFields(parseDoc, doc, docContent); + values.add(docContent); + } + } + indexDoc(parseDoc, doc, iw); + } iw.forceMerge(1); @@ -145,7 +160,7 @@ public void testSearchResultsVersusKeywordField() throws IOException { Query keywordFieldQuery = new WildcardQuery(new Term(KEYWORD_FIELD_NAME, randomWildcardPattern)); TopDocs kwTopDocs = searcher.search(keywordFieldQuery, values.size() + 1, Sort.INDEXORDER); - assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(kwTopDocs.totalHits.value)); + assertThat(kwTopDocs.totalHits.value, equalTo(wildcardFieldTopDocs.totalHits.value)); HashSet expectedDocs = new HashSet<>(); for (ScoreDoc topDoc : kwTopDocs.scoreDocs) { @@ -208,16 +223,23 @@ public MappedFieldType fieldMapper(String name) { }; } - private void createDocs(String docContent, RandomIndexWriter iw) throws IOException { + private void addFields(ParseContext.Document parseDoc, Document doc, String docContent) throws IOException { ArrayList fields = new ArrayList<>(); - wildcardFieldType.createFields(docContent, fields); - Document doc = new Document(); + wildcardFieldType.createFields(docContent, parseDoc, fields); + for (IndexableField indexableField : fields) { doc.add(indexableField); } // Add keyword fields too doc.add(new SortedSetDocValuesField(KEYWORD_FIELD_NAME, new BytesRef(docContent))); doc.add(new StringField(KEYWORD_FIELD_NAME, docContent, Field.Store.YES)); + } + + private void indexDoc(ParseContext.Document parseDoc, Document doc, RandomIndexWriter iw) throws IOException { + IndexableField field = parseDoc.getByKey(wildcardFieldType.name()); + if (field != null) { + doc.add(field); + } iw.addDocument(doc); } From 40c7929358beca861d8f95f09d93c87e7a405bb2 Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 9 Mar 2020 15:25:25 +0000 Subject: [PATCH 24/32] Unused import --- .../xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java | 1 - 1 file changed, 1 deletion(-) diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java index e49f4a23b5140..560087f6ddd1e 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java @@ -21,7 +21,6 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.ByteRunAutomaton; -import org.elasticsearch.index.mapper.BinaryFieldMapper; import java.io.IOException; import java.util.Objects; From 6ccdc3b31b042b61d7b9cb1ea5b3ca31ee93aaef Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 9 Mar 2020 15:32:21 +0000 Subject: [PATCH 25/32] Checkstyle fix --- .../xpack/wildcard/mapper/WildcardFieldMapper.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index 6918aab0bd93a..5b01a7c2a143f 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -479,7 +479,7 @@ public IndexFieldData build(IndexSettings indexSettings, MappedFieldType fiel static class WildcardBytesBinaryDVIndexFieldData extends BytesBinaryDVIndexFieldData{ - public WildcardBytesBinaryDVIndexFieldData(Index index, String fieldName) { + WildcardBytesBinaryDVIndexFieldData(Index index, String fieldName) { super(index, fieldName); } From 9e0b2b85f721e288585e92e42dafd92389b287a3 Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 9 Mar 2020 15:55:27 +0000 Subject: [PATCH 26/32] Removed String.getBytes() --- .../xpack/wildcard/mapper/WildcardFieldMapper.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index 5b01a7c2a143f..86a5ea11a6890 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -28,6 +28,7 @@ import org.apache.lucene.search.SortField; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automaton; import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.common.lucene.BytesRefs; @@ -571,10 +572,10 @@ void createFields(String value, Document parseDoc, Listfields) { CustomBinaryDocValuesField dvField = (CustomBinaryDocValuesField) parseDoc.getByKey(fieldType().name()); if (dvField == null) { - dvField = new CustomBinaryDocValuesField(fieldType().name(), value.getBytes()); + dvField = new CustomBinaryDocValuesField(fieldType().name(), new BytesRef(value).bytes); parseDoc.addWithKey(fieldType().name(), dvField); } else { - dvField.add(value.getBytes()); + dvField.add(new BytesRef(value).bytes); } } From 62553477d49061635eac20dc94ed1738db69ae5d Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 9 Mar 2020 17:41:29 +0000 Subject: [PATCH 27/32] Bugfix - overly long byte arrays being serialised for field values. --- .../xpack/wildcard/mapper/WildcardFieldMapper.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index 86a5ea11a6890..7b646e44b440e 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -58,6 +58,7 @@ import java.io.IOException; import java.io.StringReader; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Iterator; import java.util.List; @@ -572,10 +573,10 @@ void createFields(String value, Document parseDoc, Listfields) { CustomBinaryDocValuesField dvField = (CustomBinaryDocValuesField) parseDoc.getByKey(fieldType().name()); if (dvField == null) { - dvField = new CustomBinaryDocValuesField(fieldType().name(), new BytesRef(value).bytes); + dvField = new CustomBinaryDocValuesField(fieldType().name(), value.getBytes(StandardCharsets.UTF_8)); parseDoc.addWithKey(fieldType().name(), dvField); } else { - dvField.add(new BytesRef(value).bytes); + dvField.add(value.getBytes(StandardCharsets.UTF_8)); } } From 93dbdd03b14f3e21212f91cf9c49cb3844508e5d Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 9 Mar 2020 18:57:04 +0000 Subject: [PATCH 28/32] Unused import --- .../elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java | 1 - 1 file changed, 1 deletion(-) diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index 7b646e44b440e..1bc34546e4b75 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -28,7 +28,6 @@ import org.apache.lucene.search.SortField; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; -import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automaton; import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.common.lucene.BytesRefs; From ad132af7159dfd6aad2bb1e87c6f1f3820937597 Mon Sep 17 00:00:00 2001 From: markharwood Date: Tue, 10 Mar 2020 10:20:50 +0000 Subject: [PATCH 29/32] Added max clause protection and related test --- .../wildcard/mapper/WildcardFieldMapper.java | 35 ++++++++++++++----- .../mapper/WildcardFieldMapperTests.java | 29 +++++++++++++++ 2 files changed, 55 insertions(+), 9 deletions(-) diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index 1bc34546e4b75..4c8e172f95610 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -73,6 +73,7 @@ public class WildcardFieldMapper extends FieldMapper { public static final String CONTENT_TYPE = "wildcard"; public static short MAX_NUM_CHARS_COUNT = 6; //maximum allowed number of characters per ngram + public static short MAX_CLAUSES_IN_APPROXIMATION_QUERY = 10; public static class Defaults { public static final MappedFieldType FIELD_TYPE = new WildcardFieldType(); @@ -330,8 +331,7 @@ public boolean equals(Object obj) { public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QueryShardContext context) { PatternStructure patternStructure = new PatternStructure(wildcardPattern); - - BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder(); + ArrayList clauses = new ArrayList<>(); for (int i = 0; i < patternStructure.fragments.length; i++) { String fragment = patternStructure.fragments[i]; @@ -352,7 +352,7 @@ public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QuerySh if (fragment.length() == numChars) { TermQuery tq = new TermQuery(new Term(name(), fragment)); - bqBuilder.add(new BooleanClause(tq, Occur.MUST)); + clauses.add(new BooleanClause(tq, Occur.MUST)); } else if (fragment.length() > numChars) { // Break fragment into multiple Ngrams KeywordTokenizer kt = new KeywordTokenizer(256); @@ -369,7 +369,7 @@ public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QuerySh while (filter.incrementToken()) { if (charPos == nextRequiredCoverage) { TermQuery tq = new TermQuery(new Term(name(), termAtt.toString())); - bqBuilder.add(new BooleanClause(tq, Occur.MUST)); + clauses.add(new BooleanClause(tq, Occur.MUST)); nextRequiredCoverage = charPos + termAtt.length() - 1; } else { lastUnusedToken = termAtt.toString(); @@ -380,7 +380,7 @@ public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QuerySh // given `cake` and 3 grams the loop above would output only `cak` and we need to add trailing // `ake` to complete the logic. TermQuery tq = new TermQuery(new Term(name(), lastUnusedToken)); - bqBuilder.add(new BooleanClause(tq, Occur.MUST)); + clauses.add(new BooleanClause(tq, Occur.MUST)); } kt.end(); kt.close(); @@ -393,17 +393,17 @@ public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QuerySh // fragment occurs mid-string so will need a wildcard query WildcardQuery wq = new WildcardQuery(new Term(name(),fragment+"*")); wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE); - bqBuilder.add(new BooleanClause(wq, Occur.MUST)); + clauses.add(new BooleanClause(wq, Occur.MUST)); } else { // fragment occurs at end of string so can rely on Jim's indexing rule to optimise // *foo by indexing smaller ngrams at the end of a string TermQuery tq = new TermQuery(new Term(name(), fragment)); - bqBuilder.add(new BooleanClause(tq, Occur.MUST)); + clauses.add(new BooleanClause(tq, Occur.MUST)); } } } - - BooleanQuery approximation = bqBuilder.build(); + + BooleanQuery approximation = createApproximationQuery(clauses); if (patternStructure.isMatchAll()) { return new MatchAllDocsQuery(); } @@ -417,6 +417,23 @@ public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QuerySh return approximation; } + private BooleanQuery createApproximationQuery(ArrayList clauses) { + BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder(); + if (clauses.size() <= MAX_CLAUSES_IN_APPROXIMATION_QUERY) { + for (BooleanClause booleanClause : clauses) { + bqBuilder.add(booleanClause); + } + return bqBuilder.build(); + } + // Thin out the number of clauses using a selection spread + // evenly across the range + float step = (float)(clauses.size() - 1) / (float)(MAX_CLAUSES_IN_APPROXIMATION_QUERY - 1); //set step size + for (int i=0; i Date: Wed, 11 Mar 2020 12:09:14 +0000 Subject: [PATCH 30/32] Removed TaperedNgramTokenizer and numChars. Changed encoding of terms to include double end char. Removed redundant BinaryBuilder. Thread safety fixes --- .../plain/BinaryDVIndexFieldData.java | 3 +- .../plain/DocValuesIndexFieldData.java | 11 -- .../mapper/AutomatonQueryOnBinaryDv.java | 5 +- .../mapper/TaperedNgramTokenizer.java | 107 ----------- .../wildcard/mapper/WildcardFieldMapper.java | 167 ++++++------------ .../mapper/TaperedNGramTokenizerTests.java | 61 ------- .../mapper/WildcardFieldTypeTests.java | 15 +- 7 files changed, 58 insertions(+), 311 deletions(-) delete mode 100644 x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenizer.java delete mode 100644 x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNGramTokenizerTests.java diff --git a/server/src/main/java/org/elasticsearch/index/fielddata/plain/BinaryDVIndexFieldData.java b/server/src/main/java/org/elasticsearch/index/fielddata/plain/BinaryDVIndexFieldData.java index 071bb3be8474d..06352640dc162 100644 --- a/server/src/main/java/org/elasticsearch/index/fielddata/plain/BinaryDVIndexFieldData.java +++ b/server/src/main/java/org/elasticsearch/index/fielddata/plain/BinaryDVIndexFieldData.java @@ -51,8 +51,7 @@ public BinaryDVAtomicFieldData loadDirect(LeafReaderContext context) throws Exce @Override public SortField sortField(@Nullable Object missingValue, MultiValueMode sortMode, XFieldComparatorSource.Nested nested, boolean reverse) { - XFieldComparatorSource source = new BytesRefFieldComparatorSource(this, missingValue, - sortMode, nested); + XFieldComparatorSource source = new BytesRefFieldComparatorSource(this, missingValue, sortMode, nested); return new SortField(getFieldName(), source, reverse); } diff --git a/server/src/main/java/org/elasticsearch/index/fielddata/plain/DocValuesIndexFieldData.java b/server/src/main/java/org/elasticsearch/index/fielddata/plain/DocValuesIndexFieldData.java index 039338299d82b..529bdb84b12ac 100644 --- a/server/src/main/java/org/elasticsearch/index/fielddata/plain/DocValuesIndexFieldData.java +++ b/server/src/main/java/org/elasticsearch/index/fielddata/plain/DocValuesIndexFieldData.java @@ -66,17 +66,6 @@ public final void clear(IndexReader reader) { public final Index index() { return index; } - - public static class BinaryBuilder implements IndexFieldData.Builder { - - @Override - public IndexFieldData build(IndexSettings indexSettings, MappedFieldType fieldType, IndexFieldDataCache cache, - CircuitBreakerService breakerService, MapperService mapperService) { - // Ignore Circuit Breaker - return new BinaryDVIndexFieldData(indexSettings.getIndex(), fieldType.name()); - } - - } public static class Builder implements IndexFieldData.Builder { private static final Set BINARY_INDEX_FIELD_NAMES = unmodifiableSet(newHashSet(IdFieldMapper.NAME)); diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java index 560087f6ddd1e..648fbc7e0cdc3 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java @@ -33,7 +33,7 @@ public class AutomatonQueryOnBinaryDv extends Query { private final String field; private final String matchPattern; - private Automaton automaton; + private final Automaton automaton; public AutomatonQueryOnBinaryDv(String field, String matchPattern, Automaton automaton) { this.field = field; @@ -45,13 +45,12 @@ public AutomatonQueryOnBinaryDv(String field, String matchPattern, Automaton aut public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton); - ByteArrayDataInput badi = new ByteArrayDataInput(); - return new ConstantScoreWeight(this, boost) { @Override public Scorer scorer(LeafReaderContext context) throws IOException { + ByteArrayDataInput badi = new ByteArrayDataInput(); final BinaryDocValues values = DocValues.getBinary(context.reader(), field); TwoPhaseIterator twoPhase = new TwoPhaseIterator(values) { @Override diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenizer.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenizer.java deleted file mode 100644 index 98662c16597a5..0000000000000 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNgramTokenizer.java +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License; - * you may not use this file except in compliance with the Elastic License. - */ - -package org.elasticsearch.xpack.wildcard.mapper; - -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.index.IndexWriter; - -import java.io.IOException; - -public class TaperedNgramTokenizer extends Tokenizer { - - - private final int maxGram; - - private char[] curTermBuffer; - private int curTermLength; - private int curTermCodePointCount; - private int curGramSize; - private int curPos; - private int curPosIncr; - private State state; - - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); - - /** Default read buffer size */ - public static final int DEFAULT_BUFFER_SIZE = 256; - - public TaperedNgramTokenizer(int maxGram) { - if (maxGram < 1) { - throw new IllegalArgumentException("maxGram must be greater than zero"); - } - this.maxGram = maxGram; - termAtt.resizeBuffer(DEFAULT_BUFFER_SIZE); - } - - @Override - public final boolean incrementToken() throws IOException { - clearAttributes(); - while (true) { - if (curTermBuffer == null) { - loadBufferFromReader(); - state = captureState(); - - curTermLength = termAtt.length(); - curTermCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length()); - curPosIncr += posIncrAtt.getPositionIncrement(); - curPos = -1; - - curTermBuffer = termAtt.buffer().clone(); - curGramSize = Math.min(curTermCodePointCount, maxGram); - } - curPos++; - if ( (curPos + curGramSize) > curTermCodePointCount) { - // Reached near the end of the string. Start tapering token size down to 1 - curGramSize = curTermCodePointCount - curPos; - } - if (curGramSize > 0) { - restoreState(state); - final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos); - final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize); - termAtt.copyBuffer(curTermBuffer, start, end - start); - - posIncrAtt.setPositionIncrement(curPosIncr); - return true; - } - return false; - } - } - - void loadBufferFromReader() throws IOException { - int upto = 0; - curTermBuffer = termAtt.buffer(); - while (true) { - final int length = input.read(curTermBuffer, upto, curTermBuffer.length-upto); - if (length == -1) break; - upto += length; - if (upto > IndexWriter.MAX_TERM_LENGTH) { - throw new IllegalArgumentException("Provided value longer than Lucene maximum term length of " - + IndexWriter.MAX_TERM_LENGTH ); - } - if (upto == curTermBuffer.length) - curTermBuffer = termAtt.resizeBuffer(1+curTermBuffer.length); - } - termAtt.setLength(upto); - } - - @Override - public void reset() throws IOException { - super.reset(); - curTermBuffer = null; - curPosIncr = 0; - } - - @Override - public void end() throws IOException { - super.end(); - posIncrAtt.setPositionIncrement(curPosIncr); - } - -} diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index 4c8e172f95610..b71e5c949a6b7 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -10,6 +10,7 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.ngram.NGramTokenFilter; +import org.apache.lucene.analysis.ngram.NGramTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; @@ -62,7 +63,6 @@ import java.util.Iterator; import java.util.List; import java.util.Map; -import java.util.Objects; import static org.elasticsearch.index.mapper.TypeParsers.parseField; @@ -72,8 +72,8 @@ public class WildcardFieldMapper extends FieldMapper { public static final String CONTENT_TYPE = "wildcard"; - public static short MAX_NUM_CHARS_COUNT = 6; //maximum allowed number of characters per ngram public static short MAX_CLAUSES_IN_APPROXIMATION_QUERY = 10; + public static final int NGRAM_SIZE = 3; public static class Defaults { public static final MappedFieldType FIELD_TYPE = new WildcardFieldType(); @@ -86,11 +86,9 @@ public static class Defaults { FIELD_TYPE.freeze(); } public static final int IGNORE_ABOVE = Integer.MAX_VALUE; - public static final int NUM_CHARS = 3; } public static class Builder extends FieldMapper.Builder { - private int numChars = Defaults.NUM_CHARS; protected int ignoreAbove = Defaults.IGNORE_ABOVE; public Builder(String name) { @@ -101,8 +99,7 @@ public Builder(String name) { @Override public Builder docValues(boolean docValues) { if (docValues == false) { - throw new MapperParsingException("The field [" + name + - "] cannot have doc values = false"); + throw new MapperParsingException("The field [" + name + "] cannot have doc values = false"); } return this; } @@ -110,20 +107,10 @@ public Builder docValues(boolean docValues) { @Override public Builder index(boolean index) { if (index == false) { - throw new MapperParsingException("The field [" + name + - "] cannot have index = false"); + throw new MapperParsingException("The field [" + name + "] cannot have index = false"); } return this; } - - public Builder numChars(int numChars) { - if ((numChars > MAX_NUM_CHARS_COUNT) || (numChars < 1)) { - throw new MapperParsingException("The number of characters for ngrams in field [" + name + - "] should be in the range [1, " + MAX_NUM_CHARS_COUNT + "]"); - } - this.numChars = numChars; - return this; - } public Builder ignoreAbove(int ignoreAbove) { if (ignoreAbove < 0) { @@ -137,7 +124,6 @@ public Builder ignoreAbove(int ignoreAbove) { @Override protected void setupFieldType(BuilderContext context) { super.setupFieldType(context); - fieldType().setNumChars(numChars); fieldType().setHasDocValues(true); fieldType().setTokenized(false); fieldType().setIndexOptions(IndexOptions.DOCS); @@ -152,7 +138,7 @@ public WildcardFieldType fieldType() { public WildcardFieldMapper build(BuilderContext context) { setupFieldType(context); return new WildcardFieldMapper( - name, fieldType, defaultFieldType, ignoreAbove, numChars, + name, fieldType, defaultFieldType, ignoreAbove, context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo); } } @@ -168,13 +154,7 @@ public static class TypeParser implements Mapper.TypeParser { Map.Entry entry = iterator.next(); String propName = entry.getKey(); Object propNode = entry.getValue(); - if (propName.equals("num_chars")) { - if (propNode == null) { - throw new MapperParsingException("Property [numChars] cannot be null."); - } - builder.numChars(XContentMapValues.nodeIntegerValue(propNode)); - iterator.remove(); - } else if (propName.equals("ignore_above")) { + if (propName.equals("ignore_above")) { builder.ignoreAbove(XContentMapValues.nodeIntegerValue(propNode, -1)); iterator.remove(); } @@ -187,7 +167,7 @@ public static class TypeParser implements Mapper.TypeParser { public static final char TOKEN_START_OR_END_CHAR = 0; public static final class WildcardFieldType extends MappedFieldType { - private int numChars; +// private int numChars; public WildcardFieldType() { setIndexAnalyzer(Lucene.KEYWORD_ANALYZER); @@ -200,22 +180,10 @@ protected WildcardFieldType(WildcardFieldType ref) { public WildcardFieldType clone() { WildcardFieldType result = new WildcardFieldType(this); - result.setNumChars(numChars); return result; } - @Override - public int hashCode() { - return Objects.hash(super.hashCode(), numChars); - } - - @Override - public boolean equals(Object o) { - if (!super.equals(o)) return false; - WildcardFieldType that = (WildcardFieldType) o; - return numChars == that.numChars; - } - + // Holds parsed information about the wildcard pattern static class PatternStructure { boolean openStart, openEnd, hasSymbols; @@ -329,10 +297,9 @@ public boolean equals(Object obj) { @Override public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QueryShardContext context) { - PatternStructure patternStructure = new PatternStructure(wildcardPattern); - - ArrayList clauses = new ArrayList<>(); - + PatternStructure patternStructure = new PatternStructure(wildcardPattern); + ArrayList tokens = new ArrayList<>(); + for (int i = 0; i < patternStructure.fragments.length; i++) { String fragment = patternStructure.fragments[i]; int fLength = fragment.length(); @@ -347,29 +314,27 @@ public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QuerySh } if (patternStructure.openEnd == false && i == patternStructure.fragments.length - 1) { // End-of-string anchored (is not a trailing wildcard) - fragment = fragment + TOKEN_START_OR_END_CHAR; + fragment = fragment + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR; } - if (fragment.length() == numChars) { - TermQuery tq = new TermQuery(new Term(name(), fragment)); - clauses.add(new BooleanClause(tq, Occur.MUST)); - } else if (fragment.length() > numChars) { + if (fragment.length() <= NGRAM_SIZE) { + tokens.add(fragment); + } else if (fragment.length() > NGRAM_SIZE) { // Break fragment into multiple Ngrams KeywordTokenizer kt = new KeywordTokenizer(256); kt.setReader(new StringReader(fragment)); - TokenFilter filter = new NGramTokenFilter(kt, numChars, numChars, false); + TokenFilter filter = new NGramTokenFilter(kt, NGRAM_SIZE, NGRAM_SIZE, false); CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class); String lastUnusedToken = null; try { filter.reset(); int nextRequiredCoverage = 0; int charPos = 0; - // minimise number of terms searched - eg for "1234567" and 4grams we only need terms - // `1234` and `4567` - no need to search for 2345 and 3456 - while (filter.incrementToken()) { + // minimise number of terms searched - eg for "12345" and 3grams we only need terms + // `123` and `345` - no need to search for 234 + while (filter.incrementToken()) { if (charPos == nextRequiredCoverage) { - TermQuery tq = new TermQuery(new Term(name(), termAtt.toString())); - clauses.add(new BooleanClause(tq, Occur.MUST)); + tokens.add(termAtt.toString()); nextRequiredCoverage = charPos + termAtt.length() - 1; } else { lastUnusedToken = termAtt.toString(); @@ -379,31 +344,17 @@ public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QuerySh if (lastUnusedToken != null) { // given `cake` and 3 grams the loop above would output only `cak` and we need to add trailing // `ake` to complete the logic. - TermQuery tq = new TermQuery(new Term(name(), lastUnusedToken)); - clauses.add(new BooleanClause(tq, Occur.MUST)); + tokens.add(lastUnusedToken); } kt.end(); kt.close(); - } catch(IOException ioe) { - throw new ElasticsearchParseException("Error parsing wildcard query pattern fragment ["+fragment+"]"); - } - } else { - // fragment is smaller than smallest ngram size - if (patternStructure.openEnd || i < patternStructure.fragments.length - 1) { - // fragment occurs mid-string so will need a wildcard query - WildcardQuery wq = new WildcardQuery(new Term(name(),fragment+"*")); - wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE); - clauses.add(new BooleanClause(wq, Occur.MUST)); - } else { - // fragment occurs at end of string so can rely on Jim's indexing rule to optimise - // *foo by indexing smaller ngrams at the end of a string - TermQuery tq = new TermQuery(new Term(name(), fragment)); - clauses.add(new BooleanClause(tq, Occur.MUST)); + } catch (IOException ioe) { + throw new ElasticsearchParseException("Error parsing wildcard query pattern fragment [" + fragment + "]"); } } } - BooleanQuery approximation = createApproximationQuery(clauses); + BooleanQuery approximation = createApproximationQuery(tokens); if (patternStructure.isMatchAll()) { return new MatchAllDocsQuery(); } @@ -417,41 +368,34 @@ public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QuerySh return approximation; } - private BooleanQuery createApproximationQuery(ArrayList clauses) { + private BooleanQuery createApproximationQuery(ArrayList tokens) { BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder(); - if (clauses.size() <= MAX_CLAUSES_IN_APPROXIMATION_QUERY) { - for (BooleanClause booleanClause : clauses) { - bqBuilder.add(booleanClause); + if (tokens.size() <= MAX_CLAUSES_IN_APPROXIMATION_QUERY) { + for (String token : tokens) { + addClause(token, bqBuilder); } return bqBuilder.build(); } - // Thin out the number of clauses using a selection spread - // evenly across the range - float step = (float)(clauses.size() - 1) / (float)(MAX_CLAUSES_IN_APPROXIMATION_QUERY - 1); //set step size - for (int i=0; i conflicts) { - super.checkCompatibility(fieldType, conflicts); - WildcardFieldType other = (WildcardFieldType)fieldType; - // prevent user from changing num_chars - if (numChars() != other.numChars()) { - conflicts.add("mapper [" + name() + "] has different [num_chars]"); - } - } @Override public String typeName() { @@ -513,10 +457,16 @@ public SortField sortField(Object missingValue, MultiValueMode sortMode, Nested private int ignoreAbove; private WildcardFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType, - int ignoreAbove, int numChars, Settings indexSettings, MultiFields multiFields, CopyTo copyTo) { + int ignoreAbove, Settings indexSettings, MultiFields multiFields, CopyTo copyTo) { super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo); this.ignoreAbove = ignoreAbove; assert fieldType.indexOptions() == IndexOptions.DOCS; + + ngramFieldType = new FieldType(); + ngramFieldType.setTokenized(true); + ngramFieldType.setIndexOptions(IndexOptions.DOCS); + ngramFieldType.setOmitNorms(true); + ngramFieldType.freeze(); } @@ -540,9 +490,6 @@ public WildcardFieldType fieldType() { @Override protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, Params params) throws IOException { super.doXContentBody(builder, includeDefaults, params); - if (includeDefaults || fieldType().numChars() != Defaults.NUM_CHARS) { - builder.field("num_chars", fieldType().numChars()); - } if (includeDefaults || ignoreAbove != Defaults.IGNORE_ABOVE) { builder.field("ignore_above", ignoreAbove); } @@ -567,22 +514,16 @@ protected void parseCreateField(ParseContext context, List field } // For internal use by Lucene only - used to define ngram index - FieldType ngramFieldType = null; + final FieldType ngramFieldType; void createFields(String value, Document parseDoc, Listfields) { if (value == null || value.length() > ignoreAbove) { return; } - TaperedNgramTokenizer tokenizer = new TaperedNgramTokenizer(fieldType().numChars); - tokenizer.setReader(new StringReader(TOKEN_START_OR_END_CHAR + value + TOKEN_START_OR_END_CHAR)); - - if (ngramFieldType == null) { - ngramFieldType = new FieldType(); - ngramFieldType.setTokenized(true); - ngramFieldType.setIndexOptions(IndexOptions.DOCS); - ngramFieldType.setOmitNorms(true); - ngramFieldType.freeze(); - } + NGramTokenizer tokenizer = new NGramTokenizer(NGRAM_SIZE, NGRAM_SIZE); + //encode end of string with double end char so that 3gram index can be search for single char end eg "*c" + tokenizer.setReader(new StringReader(TOKEN_START_OR_END_CHAR + value + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR)); + Field ngramField = new Field(fieldType().name(), tokenizer, ngramFieldType); fields.add(ngramField); diff --git a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNGramTokenizerTests.java b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNGramTokenizerTests.java deleted file mode 100644 index b04985ea27473..0000000000000 --- a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/TaperedNGramTokenizerTests.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License; - * you may not use this file except in compliance with the Elastic License. - */ - - -package org.elasticsearch.xpack.wildcard.mapper; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; - -import java.io.IOException; -import java.io.StringReader; -import java.util.Arrays; - -import static org.hamcrest.Matchers.equalTo; - -public class TaperedNGramTokenizerTests extends BaseTokenStreamTestCase { - - - public void testLongString() throws IOException { - checkTokens("Hello world", 6, "Hello ", "ello w", "llo wo", "lo wor", "o worl", " world", "world", "orld", "rld", "ld", "d"); - } - - public void testShortString() throws IOException { - checkTokens("Hello", 5, "Hello", "ello", "llo", "lo", "o"); - } - - public void testSingleCharDoc() throws IOException { - checkTokens("H", 5, "H"); - } - - public void testSingleCharNgram() throws IOException { - checkTokens("Hello", 1, "H", "e", "l", "l", "o"); - } - - public void testFieldMapperEncoding() throws IOException { - char TOKEN_START_OR_END_CHAR = 0; - checkTokens(TOKEN_START_OR_END_CHAR+"aaa"+TOKEN_START_OR_END_CHAR, 5, - TOKEN_START_OR_END_CHAR+"aaa"+TOKEN_START_OR_END_CHAR, - "aaa"+TOKEN_START_OR_END_CHAR, - "aa"+TOKEN_START_OR_END_CHAR, - "a"+TOKEN_START_OR_END_CHAR, - ""+TOKEN_START_OR_END_CHAR); - } - - public void testTooShortNgram() { - Exception expectedException = expectThrows(IllegalArgumentException.class, () -> checkTokens("Hello", 0, "")); - assertThat(expectedException.getMessage(), equalTo("maxGram must be greater than zero")); - - } - - private static void checkTokens(String value, int ngramLength, String... expectedTokens) throws IOException { - TaperedNgramTokenizer tokenizer = new TaperedNgramTokenizer(ngramLength); - tokenizer.setReader(new StringReader(value)); - int [] posIncs = new int[expectedTokens.length]; - Arrays.fill(posIncs, 1); - assertTokenStreamContents(tokenizer, expectedTokens, posIncs); - } - -} diff --git a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldTypeTests.java b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldTypeTests.java index b3c8f49f5343f..0d090e4a25058 100644 --- a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldTypeTests.java +++ b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldTypeTests.java @@ -9,24 +9,11 @@ import org.elasticsearch.index.mapper.FieldTypeTestCase; import org.elasticsearch.index.mapper.MappedFieldType; -import org.elasticsearch.xpack.wildcard.mapper.WildcardFieldMapper.WildcardFieldType; -import org.junit.Before; public class WildcardFieldTypeTests extends FieldTypeTestCase { @Override protected MappedFieldType createDefaultFieldType() { return new WildcardFieldMapper.WildcardFieldType(); - } - - @Before - public void setupProperties() { - addModifier(new Modifier("num_chars", false) { - @Override - public void modify(MappedFieldType ft) { - WildcardFieldType fieldType = (WildcardFieldType) ft; - fieldType.setNumChars(5); - } - }); - } + } } From 8435ec602c2740f4f2de31464065847eb28e9ff9 Mon Sep 17 00:00:00 2001 From: markharwood Date: Thu, 12 Mar 2020 11:15:17 +0000 Subject: [PATCH 31/32] =?UTF-8?q?Addressed=20Adrien=E2=80=99s=20review=20c?= =?UTF-8?q?omments=20(minus=20the=20use=20of=20custom=20Analyzer)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../wildcard/mapper/WildcardFieldMapper.java | 50 +++++++++---------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index b71e5c949a6b7..c5c90d44a9ac5 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -7,9 +7,6 @@ package org.elasticsearch.xpack.wildcard.mapper; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.core.KeywordTokenizer; -import org.apache.lucene.analysis.ngram.NGramTokenFilter; import org.apache.lucene.analysis.ngram.NGramTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.document.Field; @@ -167,7 +164,6 @@ public static class TypeParser implements Mapper.TypeParser { public static final char TOKEN_START_OR_END_CHAR = 0; public static final class WildcardFieldType extends MappedFieldType { -// private int numChars; public WildcardFieldType() { setIndexAnalyzer(Lucene.KEYWORD_ANALYZER); @@ -316,48 +312,46 @@ public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QuerySh // End-of-string anchored (is not a trailing wildcard) fragment = fragment + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR; } - - if (fragment.length() <= NGRAM_SIZE) { + if (fragment.codePointCount(0, fragment.length()) <= NGRAM_SIZE) { tokens.add(fragment); - } else if (fragment.length() > NGRAM_SIZE) { + } else { // Break fragment into multiple Ngrams - KeywordTokenizer kt = new KeywordTokenizer(256); - kt.setReader(new StringReader(fragment)); - TokenFilter filter = new NGramTokenFilter(kt, NGRAM_SIZE, NGRAM_SIZE, false); - CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class); + NGramTokenizer tokenizer = new NGramTokenizer(NGRAM_SIZE, NGRAM_SIZE); + tokenizer.setReader(new StringReader(fragment)); + CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class); String lastUnusedToken = null; try { - filter.reset(); - int nextRequiredCoverage = 0; - int charPos = 0; + tokenizer.reset(); + boolean takeThis = true; // minimise number of terms searched - eg for "12345" and 3grams we only need terms - // `123` and `345` - no need to search for 234 - while (filter.incrementToken()) { - if (charPos == nextRequiredCoverage) { - tokens.add(termAtt.toString()); - nextRequiredCoverage = charPos + termAtt.length() - 1; + // `123` and `345` - no need to search for 234. We take every other ngram. + while (tokenizer.incrementToken()) { + String tokenValue = termAtt.toString(); + if (takeThis) { + tokens.add(tokenValue); } else { - lastUnusedToken = termAtt.toString(); + lastUnusedToken = tokenValue; } - charPos++; + // alternate + takeThis = !takeThis; } if (lastUnusedToken != null) { // given `cake` and 3 grams the loop above would output only `cak` and we need to add trailing // `ake` to complete the logic. tokens.add(lastUnusedToken); } - kt.end(); - kt.close(); + tokenizer.end(); + tokenizer.close(); } catch (IOException ioe) { throw new ElasticsearchParseException("Error parsing wildcard query pattern fragment [" + fragment + "]"); } } } - BooleanQuery approximation = createApproximationQuery(tokens); if (patternStructure.isMatchAll()) { return new MatchAllDocsQuery(); } + BooleanQuery approximation = createApproximationQuery(tokens); if (approximation.clauses().size() > 1 || patternStructure.needsVerification()) { BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder(); verifyingBuilder.add(new BooleanClause(approximation, Occur.MUST)); @@ -381,12 +375,16 @@ private BooleanQuery createApproximationQuery(ArrayList tokens) { for (int i = 0; i < MAX_CLAUSES_IN_APPROXIMATION_QUERY; i++) { addClause(tokens.get(Math.round(step * i)), bqBuilder); // add each element of a position which is a multiple of step } + // TODO we can be smarter about pruning here. e.g. + // * Avoid wildcard queries if there are sufficient numbers of other terms that are full 3grams that are cheaper term queries + // * We can select terms on their scarcity rather than even spreads across the search string. + return bqBuilder.build(); } private void addClause(String token, BooleanQuery.Builder bqBuilder) { - assert token.length() <= NGRAM_SIZE; - if (token.length() == NGRAM_SIZE) { + assert token.codePointCount(0, token.length()) <= NGRAM_SIZE; + if (token.codePointCount(0, token.length()) == NGRAM_SIZE) { TermQuery tq = new TermQuery(new Term(name(), token)); bqBuilder.add(new BooleanClause(tq, Occur.MUST)); } else { From 9641b72a5b8235e1b16221468aacea6030f2248d Mon Sep 17 00:00:00 2001 From: markharwood Date: Fri, 13 Mar 2020 14:01:13 +0000 Subject: [PATCH 32/32] Switched to reusing same Analyzer for all tokenisation. Added checks in Builder for invalid options, --- .../wildcard/mapper/WildcardFieldMapper.java | 60 +++++++++++++------ .../mapper/WildcardFieldMapperTests.java | 6 +- 2 files changed, 46 insertions(+), 20 deletions(-) diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index c5c90d44a9ac5..e489d8a35bb9f 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -7,10 +7,12 @@ package org.elasticsearch.xpack.wildcard.mapper; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.ngram.NGramTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.document.Field; -import org.apache.lucene.document.FieldType; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; @@ -36,6 +38,8 @@ import org.elasticsearch.common.xcontent.support.XContentMapValues; import org.elasticsearch.index.Index; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AnalyzerScope; +import org.elasticsearch.index.analysis.NamedAnalyzer; import org.elasticsearch.index.fielddata.IndexFieldData; import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested; import org.elasticsearch.index.fielddata.IndexFieldDataCache; @@ -50,11 +54,11 @@ import org.elasticsearch.index.mapper.ParseContext; import org.elasticsearch.index.mapper.ParseContext.Document; import org.elasticsearch.index.query.QueryShardContext; +import org.elasticsearch.index.similarity.SimilarityProvider; import org.elasticsearch.indices.breaker.CircuitBreakerService; import org.elasticsearch.search.MultiValueMode; import java.io.IOException; -import java.io.StringReader; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Iterator; @@ -71,12 +75,21 @@ public class WildcardFieldMapper extends FieldMapper { public static final String CONTENT_TYPE = "wildcard"; public static short MAX_CLAUSES_IN_APPROXIMATION_QUERY = 10; public static final int NGRAM_SIZE = 3; + static final NamedAnalyzer WILDCARD_ANALYZER = new NamedAnalyzer("_wildcard", AnalyzerScope.GLOBAL, new Analyzer() { + @Override + public TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new NGramTokenizer(NGRAM_SIZE, NGRAM_SIZE); + return new TokenStreamComponents(tokenizer); + } + }); public static class Defaults { public static final MappedFieldType FIELD_TYPE = new WildcardFieldType(); static { FIELD_TYPE.setTokenized(false); + FIELD_TYPE.setIndexAnalyzer(WILDCARD_ANALYZER); + FIELD_TYPE.setSearchAnalyzer(Lucene.KEYWORD_ANALYZER); FIELD_TYPE.setIndexOptions(IndexOptions.DOCS); FIELD_TYPE.setStoreTermVectorOffsets(false); FIELD_TYPE.setOmitNorms(true); @@ -100,7 +113,28 @@ public Builder docValues(boolean docValues) { } return this; } - + + @Override + public Builder indexOptions(IndexOptions indexOptions) { + if (indexOptions != IndexOptions.DOCS) { + throw new MapperParsingException("The field [" + name + "] cannot have indexOptions = " + indexOptions); + } + return this; + } + + @Override + public Builder store(boolean store) { + if (store) { + throw new MapperParsingException("The field [" + name + "] cannot have store = true"); + } + return this; + } + + @Override + public Builder similarity(SimilarityProvider similarity) { + throw new MapperParsingException("The field [" + name + "] cannot have custom similarities"); + } + @Override public Builder index(boolean index) { if (index == false) { @@ -316,8 +350,7 @@ public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QuerySh tokens.add(fragment); } else { // Break fragment into multiple Ngrams - NGramTokenizer tokenizer = new NGramTokenizer(NGRAM_SIZE, NGRAM_SIZE); - tokenizer.setReader(new StringReader(fragment)); + TokenStream tokenizer = WILDCARD_ANALYZER.tokenStream(name(), fragment); CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class); String lastUnusedToken = null; try { @@ -460,14 +493,11 @@ private WildcardFieldMapper(String simpleName, MappedFieldType fieldType, Mapped this.ignoreAbove = ignoreAbove; assert fieldType.indexOptions() == IndexOptions.DOCS; - ngramFieldType = new FieldType(); - ngramFieldType.setTokenized(true); - ngramFieldType.setIndexOptions(IndexOptions.DOCS); - ngramFieldType.setOmitNorms(true); + ngramFieldType = fieldType.clone(); + ngramFieldType.setTokenized(true); ngramFieldType.freeze(); } - /** Values that have more chars than the return value of this method will * be skipped at parsing time. */ // pkg-private for testing @@ -512,18 +542,14 @@ protected void parseCreateField(ParseContext context, List field } // For internal use by Lucene only - used to define ngram index - final FieldType ngramFieldType; + final MappedFieldType ngramFieldType; void createFields(String value, Document parseDoc, Listfields) { if (value == null || value.length() > ignoreAbove) { return; } - NGramTokenizer tokenizer = new NGramTokenizer(NGRAM_SIZE, NGRAM_SIZE); - //encode end of string with double end char so that 3gram index can be search for single char end eg "*c" - tokenizer.setReader(new StringReader(TOKEN_START_OR_END_CHAR + value + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR)); - - - Field ngramField = new Field(fieldType().name(), tokenizer, ngramFieldType); + String ngramValue = TOKEN_START_OR_END_CHAR + value + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR; + Field ngramField = new Field(fieldType().name(), ngramValue, ngramFieldType); fields.add(ngramField); CustomBinaryDocValuesField dvField = (CustomBinaryDocValuesField) parseDoc.getByKey(fieldType().name()); diff --git a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java index 30a23a667e8c5..f8c8ddc7f5eef 100644 --- a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java +++ b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java @@ -93,7 +93,7 @@ public void testIllegalIndexedArgument() { public void testTooBigKeywordField() throws IOException { Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(); + IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER); iwc.setMergePolicy(newTieredMergePolicy(random())); RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); @@ -120,7 +120,7 @@ public void testTooBigKeywordField() throws IOException { //Test long query strings don't cause exceptions public void testTooBigQueryField() throws IOException { Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(); + IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER); iwc.setMergePolicy(newTieredMergePolicy(random())); RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); @@ -148,7 +148,7 @@ public void testTooBigQueryField() throws IOException { public void testSearchResultsVersusKeywordField() throws IOException { Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(); + IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER); iwc.setMergePolicy(newTieredMergePolicy(random())); RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);