From 1da9416e3c56ee75174eddb161067e90d41f525b Mon Sep 17 00:00:00 2001
From: Itamar Syn-Hershko
Date: Wed, 19 Apr 2017 16:45:52 +0300
Subject: [PATCH 1/4] [Feature] Adding a char_group tokenizer

=== Char Group Tokenizer

The `char_group` tokenizer breaks text into terms whenever it encounters a
character which is in a defined set. It is mostly useful for cases where a simple
custom tokenization is desired, and the overhead of the <<analysis-pattern-tokenizer,`pattern` tokenizer>>
is not acceptable.

=== Configuration

The `char_group` tokenizer accepts one parameter:

`tokenize_on_chars`::
    A string containing a list of characters to tokenize the string on. Whenever a character
    from this list is encountered, a new token is started. Also supports escaped values like `\\n` and `\\f`,
    and in addition `\\s` to represent whitespace, `\\d` to represent digits and `\\w` to represent letters.
    Defaults to an empty list.

=== Example output

```The 2 QUICK Brown-Foxes jumped over the lazy dog's bone for $2```

When the configuration `\\s-:<>` is used for `tokenize_on_chars`, the above sentence would produce the following terms:

```[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone, for, $2 ]```
---
 .../analysis/CharGroupTokenizerFactory.java   | 125 ++++++++++++++++++
 .../indices/analysis/AnalysisModule.java      |   2 +
 .../CharGroupTokenizerFactoryTests.java       |  69 ++++++++++
 .../tokenizers/chargroup-tokenizer.asciidoc   |  31 +++++
 4 files changed, 227 insertions(+)
 create mode 100644 core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java
 create mode 100644 core/src/test/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactoryTests.java
 create mode 100644 docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc
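To see the intended behaviour end to end, the example above can be reproduced with an `_analyze` request once the tokenizer is registered. This is a minimal sketch; the inline-tokenizer request form shown here is the one used by the docs later in this series, and is assumed rather than taken from this patch:

```
POST _analyze
{
  "tokenizer": {
    "type": "char_group",
    "tokenize_on_chars": "\\s-:<>"
  },
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone for $2"
}
```

Whitespace, `-`, `:`, `<` and `>` each end the current token, while characters outside the set, such as `'` and `$`, are kept, which is why `dog's` and `$2` survive as single terms.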
diff --git a/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java
new file mode 100644
index 0000000000000..b6857deb19065
--- /dev/null
+++ b/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.CharTokenizer;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+
+import java.util.HashSet;
+import java.util.Set;
+
+public class CharGroupTokenizerFactory extends AbstractTokenizerFactory{
+
+    private final Set<Integer> tokenizeOnChars = new HashSet<>();
+    private boolean tokenizeOnSpace = false;
+    private boolean tokenizeOnLetter = false;
+    private boolean tokenizeOnDigit = false;
+
+    public CharGroupTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+        super(indexSettings, name, settings);
+
+        char[] chars = parseCharsList(settings.get("tokenize_on_chars"));
+        if (chars != null) {
+            for (char c : chars) {
+                tokenizeOnChars.add((int) c);
+            }
+        }
+    }
+
+    private char[] parseCharsList(final String s) {
+        char[] out = new char[256];
+        int readPos = 0;
+        int len = s.length();
+        int writePos = 0;
+        while (readPos < len) {
+            char c = s.charAt(readPos++);
+            if (c == '\\') {
+                if (readPos >= len)
+                    throw new RuntimeException("Invalid escaped char in [" + s + "]");
+                c = s.charAt(readPos++);
+                switch (c) {
+                    case '\\':
+                        c = '\\';
+                        break;
+                    case 'n':
+                        c = '\n';
+                        break;
+                    case 't':
+                        c = '\t';
+                        break;
+                    case 'r':
+                        c = '\r';
+                        break;
+                    case 'b':
+                        c = '\b';
+                        break;
+                    case 'f':
+                        c = '\f';
+                        break;
+                    case 'u':
+                        if (readPos + 3 >= len)
+                            throw new RuntimeException("Invalid escaped char in [" + s + "]");
+                        c = (char) Integer.parseInt(s.substring(readPos, readPos + 4), 16);
+                        readPos += 4;
+                        break;
+                    case 's':
+                        tokenizeOnSpace = true;
+                        writePos++;
+                        continue;
+                    case 'd':
+                        tokenizeOnDigit = true;
+                        writePos++;
+                        continue;
+                    case 'w':
+                        tokenizeOnLetter = true;
+                        writePos++;
+                        continue;
+                    default:
+                        throw new RuntimeException("Invalid escaped char " + c + " in [" + s + "]");
+                }
+            }
+            out[writePos++] = c;
+        }
+        return out;
+    }
+
+    @Override
+    public Tokenizer create() {
+        return new CharTokenizer() {
+            @Override
+            protected boolean isTokenChar(int c) {
+                if (tokenizeOnSpace && Character.isWhitespace(c)) {
+                    return false;
+                }
+                if (tokenizeOnLetter && Character.isLetter(c)) {
+                    return false;
+                }
+                if (tokenizeOnDigit && Character.isDigit(c)) {
+                    return false;
+                }
+                // TODO also support PUNCTUATION and SYMBOL a la CharMatcher ?
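+                // CharTokenizer contract: isTokenChar(c) == true keeps c inside
+                // the current token, while returning false ends the token there,
+                // so the class flags above and the explicit set checked below
+                // both turn their matching characters into delimiters.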
+                return !tokenizeOnChars.contains(c);
+            }
+        };
+    }
+}
diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
index 61950942e6076..5625607219c4a 100644
--- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
+++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
@@ -86,6 +86,7 @@ import org.elasticsearch.index.analysis.LatvianAnalyzerProvider;
 import org.elasticsearch.index.analysis.LengthTokenFilterFactory;
 import org.elasticsearch.index.analysis.LetterTokenizerFactory;
+import org.elasticsearch.index.analysis.CharGroupTokenizerFactory;
 import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory;
 import org.elasticsearch.index.analysis.LithuanianAnalyzerProvider;
 import org.elasticsearch.index.analysis.LowerCaseTokenFilterFactory;
@@ -276,6 +277,7 @@ private NamedRegistry<AnalysisProvider<TokenizerFactory>> setupTokenizers(List<AnalysisPlugin> plugins) {
+        tokenizers.register("char_group", CharGroupTokenizerFactory::new);
diff --git a/core/src/test/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactoryTests.java
new file mode 100644
index 0000000000000..04d772926c6e5
--- /dev/null
+++ b/core/src/test/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactoryTests.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.test.ESTokenStreamTestCase;
+import org.elasticsearch.test.IndexSettingsModule;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Arrays;
+
+public class CharGroupTokenizerFactoryTests extends ESTokenStreamTestCase {
+
+    public void testParseTokenChars() {
+        final Index index = new Index("test", "_na_");
+        final String name = "cg";
+        final Settings indexSettings = newAnalysisSettingsBuilder().build();
+        IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
+        for (String conf : Arrays.asList("\\v", "abc\\$")) {
+            final Settings settings = newAnalysisSettingsBuilder().put("tokenize_on_chars", conf).build();
+            try {
+                new CharGroupTokenizerFactory(indexProperties, null, name, settings).create();
+                fail();
+            } catch (RuntimeException expected) {
+                // OK
+            }
+        }
+
+        for (String conf : Arrays.asList("", "\\s", "abc", "abc\\s", "\\w", "foo\\d")) {
+            final Settings settings = newAnalysisSettingsBuilder().put("tokenize_on_chars", conf).build();
+            indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
+            new CharGroupTokenizerFactory(indexProperties, null, name, settings).create();
+            // no exception
+        }
+    }
+
+    public void testTokenization() throws IOException {
+        final Index index = new Index("test", "_na_");
+        final String name = "cg";
+        final Settings indexSettings = newAnalysisSettingsBuilder().build();
+        final Settings settings = newAnalysisSettingsBuilder().put("tokenize_on_chars", "\\s:<>$").build();
+        Tokenizer tokenizer = new CharGroupTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings),
+            null, name, settings).create();
+        tokenizer.setReader(new StringReader("foo bar $34 test:test2"));
+        assertTokenStreamContents(tokenizer, new String[] {"foo", "bar", "34", "test", "test2"});
+    }
+}
diff --git a/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc
new file mode 100644
index 0000000000000..b1970f18fa2db
--- /dev/null
+++ b/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc
@@ -0,0 +1,31 @@
+[[analysis-chargroup-tokenizer]]
+=== Char Group Tokenizer
+
+The `char_group` tokenizer breaks text into terms whenever it encounters a
+character which is in a defined set. It is mostly useful for cases where a simple
+custom tokenization is desired, and the overhead of the <<analysis-pattern-tokenizer,`pattern` tokenizer>>
+is not acceptable.
+
+[float]
+=== Configuration
+
+The `char_group` tokenizer accepts one parameter:
+
+[horizontal]
+`tokenize_on_chars`::
+    A string containing a list of characters to tokenize the string on. Whenever a character
+    from this list is encountered, a new token is started. Also supports escaped values like `\\n` and `\\f`,
+    and in addition `\\s` to represent whitespace, `\\d` to represent digits and `\\w` to represent letters.
+    Defaults to an empty list.
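+
+For example, an index could define a custom tokenizer of this type along the
+following lines (a minimal illustrative sketch; the index name, tokenizer name
+and character set are placeholders, not part of the patch):
+
+[source,js]
+---------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "tokenizer": {
+        "my_tokenizer": {
+          "type": "char_group",
+          "tokenize_on_chars": "\\s-:<>"
+        }
+      }
+    }
+  }
+}
+---------------------------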
+
+[float]
+=== Example output
+
+```The 2 QUICK Brown-Foxes jumped over the lazy dog's bone for $2```
+
+When the configuration `\\s-:<>` is used for `tokenize_on_chars`, the above sentence would produce the following terms:
+
+[source,text]
+---------------------------
+[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone, for, $2 ]
+---------------------------

From 9c8c4ea8457df3dafe6086bc468c9348dd72bd08 Mon Sep 17 00:00:00 2001
From: Itamar Syn-Hershko
Date: Wed, 13 Sep 2017 13:08:41 +0300
Subject: [PATCH 2/4] Updating config

---
 .../analysis/CharGroupTokenizerFactory.java   | 112 ++++++++++--------
 1 file changed, 60 insertions(+), 52 deletions(-)

diff --git a/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java
index b6857deb19065..406815cb9f3e9 100644
--- a/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java
+++ b/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java
@@ -34,73 +34,76 @@ public class CharGroupTokenizerFactory extends AbstractTokenizerFactory{
     private boolean tokenizeOnSpace = false;
     private boolean tokenizeOnLetter = false;
     private boolean tokenizeOnDigit = false;
+    private boolean tokenizeOnPunctuation = false;
+    private boolean tokenizeOnSymbol = false;
 
     public CharGroupTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
 
-        char[] chars = parseCharsList(settings.get("tokenize_on_chars"));
-        if (chars != null) {
-            for (char c : chars) {
-                tokenizeOnChars.add((int) c);
+        for (final String c : settings.getAsArray(settings.get("tokenize_on_chars"))) {
+            if (c == null || c.length() == 0) {
+                throw new RuntimeException("tokenize_on_chars cannot contain empty characters");
             }
-        }
-    }
 
-    private char[] parseCharsList(final String s) {
-        char[] out = new char[256];
-        int readPos = 0;
-        int len = s.length();
-        int writePos = 0;
-        while (readPos < len) {
-            char c = s.charAt(readPos++);
-            if (c == '\\') {
-                if (readPos >= len)
-                    throw new RuntimeException("Invalid escaped char in [" + s + "]");
-                c = s.charAt(readPos++);
+            if (c.length() == 1) {
+                tokenizeOnChars.add((int) c.charAt(0));
+            }
+            else if (c.charAt(0) == '\\') {
+                tokenizeOnChars.add((int) parseEscapedChar(c));
+            } else {
                 switch (c) {
-                    case '\\':
-                        c = '\\';
-                        break;
-                    case 'n':
-                        c = '\n';
-                        break;
-                    case 't':
-                        c = '\t';
+                    case "letter":
+                        tokenizeOnLetter = true;
                         break;
-                    case 'r':
-                        c = '\r';
+                    case "digit":
+                        tokenizeOnDigit = true;
                         break;
-                    case 'b':
-                        c = '\b';
+                    case "whitespace":
+                        tokenizeOnSpace = true;
                         break;
-                    case 'f':
-                        c = '\f';
+                    case "punctuation":
+                        tokenizeOnPunctuation = true;
                         break;
-                    case 'u':
-                        if (readPos + 3 >= len)
-                            throw new RuntimeException("Invalid escaped char in [" + s + "]");
-                        c = (char) Integer.parseInt(s.substring(readPos, readPos + 4), 16);
-                        readPos += 4;
+                    case "symbol":
+                        tokenizeOnSymbol = true;
                         break;
-                    case 's':
-                        tokenizeOnSpace = true;
-                        writePos++;
-                        continue;
-                    case 'd':
-                        tokenizeOnDigit = true;
-                        writePos++;
-                        continue;
-                    case 'w':
-                        tokenizeOnLetter = true;
-                        writePos++;
-                        continue;
                     default:
-                        throw new RuntimeException("Invalid escaped char " + c + " in [" + s + "]");
+                        throw new RuntimeException("Invalid escaped char in [" + c + "]");
                 }
             }
-            out[writePos++] = c;
         }
-        return out;
+    }
+
+    private char parseEscapedChar(final String s) {
+        int len = s.length();
+        char c = s.charAt(0);
+        if (c == '\\') {
+            if (1 >= len)
+                throw new RuntimeException("Invalid escaped char in [" + s + "]");
+            c = s.charAt(1);
+            switch (c) {
+                case '\\':
+                    return '\\';
+                case 'n':
+                    return '\n';
+                case 't':
+                    return '\t';
+                case 'r':
+                    return '\r';
+                case 'b':
+                    return '\b';
+                case 'f':
+                    return '\f';
+                case 'u':
+                    if (4 >= len)
+                        throw new RuntimeException("Invalid escaped char in [" + s + "]");
+                    return (char) Integer.parseInt(s.substring(1, 5), 16);
+                default:
+                    throw new RuntimeException("Invalid escaped char " + c + " in [" + s + "]");
+            }
+        } else {
+            throw new RuntimeException("Invalid escaped char [" + s + "]");
+        }
     }
 
     @Override
@@ -117,7 +120,12 @@ protected boolean isTokenChar(int c) {
             if (tokenizeOnDigit && Character.isDigit(c)) {
                 return false;
             }
-            // TODO also support PUNCTUATION and SYMBOL a la CharMatcher ?
+            if (tokenizeOnPunctuation && CharMatcher.Basic.PUNCTUATION.isTokenChar(c)) {
+                return false;
+            }
+            if (tokenizeOnSymbol && CharMatcher.Basic.PUNCTUATION.isTokenChar(c)) {
+                return false;
+            }
             return !tokenizeOnChars.contains(c);
         }
     };

From 014e64c7167e8a81cbbe720847807c2baf99acfd Mon Sep 17 00:00:00 2001
From: Itamar Syn-Hershko
Date: Wed, 13 Sep 2017 13:10:27 +0300
Subject: [PATCH 3/4] Fixing typo

---
 .../elasticsearch/index/analysis/CharGroupTokenizerFactory.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java
index 406815cb9f3e9..2aec9ae749f05 100644
--- a/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java
+++ b/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java
@@ -123,7 +123,7 @@ protected boolean isTokenChar(int c) {
             if (tokenizeOnPunctuation && CharMatcher.Basic.PUNCTUATION.isTokenChar(c)) {
                 return false;
             }
-            if (tokenizeOnSymbol && CharMatcher.Basic.PUNCTUATION.isTokenChar(c)) {
+            if (tokenizeOnSymbol && CharMatcher.Basic.SYMBOL.isTokenChar(c)) {
                 return false;
             }
             return !tokenizeOnChars.contains(c);

From 752d0cfafdae6c5645ce6ea403ac0f483f188db1 Mon Sep 17 00:00:00 2001
From: Adrien Grand
Date: Fri, 18 May 2018 17:13:38 +0200
Subject: [PATCH 4/4] Move things to analysis/common.

---
 docs/reference/analysis/tokenizers.asciidoc   |  7 ++
 .../tokenizers/chargroup-tokenizer.asciidoc   | 65 ++++++++++++++---
 .../common}/CharGroupTokenizerFactory.java    | 12 ++--
 .../analysis/common/CommonAnalysisPlugin.java |  1 +
 .../CharGroupTokenizerFactoryTests.java       | 35 +++++----
 5 files changed, 92 insertions(+), 28 deletions(-)
 rename {core/src/main/java/org/elasticsearch/index/analysis => modules/analysis-common/src/main/java/org/elasticsearch/analysis/common}/CharGroupTokenizerFactory.java (91%)
 rename {core/src/test/java/org/elasticsearch/index/analysis => modules/analysis-common/src/test/java/org/elasticsearch/analysis/common}/CharGroupTokenizerFactoryTests.java (71%)

diff --git a/docs/reference/analysis/tokenizers.asciidoc b/docs/reference/analysis/tokenizers.asciidoc
index add0abdec0123..d6f15ded05fab 100644
--- a/docs/reference/analysis/tokenizers.asciidoc
+++ b/docs/reference/analysis/tokenizers.asciidoc
@@ -103,6 +103,11 @@
 The `simple_pattern` tokenizer uses a regular expression to capture matching
 text as terms. It uses a restricted subset of regular expression features
 and is generally faster than the `pattern` tokenizer.
 
+<<analysis-chargroup-tokenizer,Char Group Tokenizer>>::
+
+The `char_group` tokenizer is configurable through sets of characters to split
+on, which is usually less expensive than running regular expressions.
+
 <<analysis-simplepatternsplit-tokenizer,Simple Pattern Split Tokenizer>>::
 
 The `simple_pattern_split` tokenizer uses the same restricted regular expression
@@ -143,6 +148,8 @@
 include::tokenizers/keyword-tokenizer.asciidoc[]
 
 include::tokenizers/pattern-tokenizer.asciidoc[]
 
+include::tokenizers/chargroup-tokenizer.asciidoc[]
+
 include::tokenizers/simplepattern-tokenizer.asciidoc[]
 
 include::tokenizers/simplepatternsplit-tokenizer.asciidoc[]
diff --git a/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc
index b1970f18fa2db..e6bf79b0e961f 100644
--- a/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc
+++ b/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc
@@ -13,19 +13,68 @@ The `char_group` tokenizer accepts one parameter:
 
 [horizontal]
 `tokenize_on_chars`::
-    A string containing a list of characters to tokenize the string on. Whenever a character
-    from this list is encountered, a new token is started. Also supports escaped values like `\\n` and `\\f`,
-    and in addition `\\s` to represent whitespace, `\\d` to represent digits and `\\w` to represent letters.
-    Defaults to an empty list.
+    A list of characters to tokenize the string on. Whenever a character
+    from this list is encountered, a new token is started. This accepts either single
+    characters such as `-`, or character groups: `whitespace`, `letter`, `digit`,
+    `punctuation`, `symbol`.
+
 
 [float]
 === Example output
 
-```The 2 QUICK Brown-Foxes jumped over the lazy dog's bone for $2```
-
-When the configuration `\\s-:<>` is used for `tokenize_on_chars`, the above sentence would produce the following terms:
-
-[source,text]
+[source,js]
 ---------------------------
-[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone, for, $2 ]
+POST _analyze
+{
+  "tokenizer": {
+    "type": "char_group",
+    "tokenize_on_chars": [
+      "whitespace",
+      "-",
+      "\n"
+    ]
+  },
+  "text": "The QUICK brown-fox"
+}
+---------------------------
+// CONSOLE
+
+returns
+
+[source,js]
+---------------------------
+{
+  "tokens": [
+    {
+      "token": "The",
+      "start_offset": 0,
+      "end_offset": 3,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "QUICK",
+      "start_offset": 4,
+      "end_offset": 9,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "brown",
+      "start_offset": 10,
+      "end_offset": 15,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "fox",
+      "start_offset": 16,
+      "end_offset": 19,
+      "type": "word",
+      "position": 3
+    }
+  ]
+}
 ---------------------------
+// TESTRESPONSE
+
diff --git a/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactory.java
similarity index 91%
rename from core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java
rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactory.java
index 2aec9ae749f05..d4e1e794a309b 100644
--- a/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactory.java
@@ -17,13 +17,14 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.util.CharTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
 import java.util.HashSet;
 import java.util.Set;
@@ -40,9 +41,9 @@ public class CharGroupTokenizerFactory extends AbstractTokenizerFactory{
     public CharGroupTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
 
-        for (final String c : settings.getAsArray(settings.get("tokenize_on_chars"))) {
+        for (final String c : settings.getAsList("tokenize_on_chars")) {
             if (c == null || c.length() == 0) {
-                throw new RuntimeException("tokenize_on_chars cannot contain empty characters");
+                throw new RuntimeException("[tokenize_on_chars] cannot contain empty characters");
             }
 
             if (c.length() == 1) {
@@ -95,9 +96,10 @@ private char parseEscapedChar(final String s) {
             case 'f':
                 return '\f';
             case 'u':
-                if (4 >= len)
+                if (len > 6) {
                     throw new RuntimeException("Invalid escaped char in [" + s + "]");
-                return (char) Integer.parseInt(s.substring(1, 5), 16);
+                }
+                return (char) Integer.parseInt(s.substring(2), 16);
             default:
                 throw new RuntimeException("Invalid escaped char " + c + " in [" + s + "]");
             }
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
index 624194092a02e..02a4197fba94a 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@@ -184,6 +184,7 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
     tokenizers.put("ngram", NGramTokenizerFactory::new);
     tokenizers.put("edgeNGram", EdgeNGramTokenizerFactory::new);
     tokenizers.put("edge_ngram", EdgeNGramTokenizerFactory::new);
+    tokenizers.put("char_group", CharGroupTokenizerFactory::new);
     tokenizers.put("classic", ClassicTokenizerFactory::new);
     tokenizers.put("letter", LetterTokenizerFactory::new);
     tokenizers.put("lowercase", LowerCaseTokenizerFactory::new);
diff --git a/core/src/test/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactoryTests.java
similarity index 71%
rename from core/src/test/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactoryTests.java
rename to modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactoryTests.java
index 04d772926c6e5..1447531aa8731 100644
--- a/core/src/test/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactoryTests.java
@@ -17,7 +17,7 @@
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.elasticsearch.common.settings.Settings;
@@ -34,23 +34,28 @@ public class CharGroupTokenizerFactoryTests extends ESTokenStreamTestCase {
 
     public void testParseTokenChars() {
         final Index index = new Index("test", "_na_");
-        final String name = "cg";
         final Settings indexSettings = newAnalysisSettingsBuilder().build();
         IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
-        for (String conf : Arrays.asList("\\v", "abc\\$")) {
-            final Settings settings = newAnalysisSettingsBuilder().put("tokenize_on_chars", conf).build();
-            try {
-                new CharGroupTokenizerFactory(indexProperties, null, name, settings).create();
-                fail();
-            } catch (RuntimeException expected) {
-                // OK
-            }
+        final String name = "cg";
+        for (String[] conf : Arrays.asList(
+            new String[] { "\\v" },
+            new String[] { "\\u00245" },
+            new String[] { "commas" },
+            new String[] { "a", "b", "c", "\\$" })) {
+            final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", conf).build();
+            expectThrows(RuntimeException.class, () -> new CharGroupTokenizerFactory(indexProperties, null, name, settings).create());
         }
 
-        for (String conf : Arrays.asList("", "\\s", "abc", "abc\\s", "\\w", "foo\\d")) {
-            final Settings settings = newAnalysisSettingsBuilder().put("tokenize_on_chars", conf).build();
-            indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
-
+        for (String[] conf : Arrays.asList(
+            new String[0],
+            new String[] { "\\n" },
+            new String[] { "\\u0024" },
+            new String[] { "whitespace" },
+            new String[] { "a", "b", "c" },
+            new String[] { "a", "b", "c", "\\r" },
+            new String[] { "\\r" },
+            new String[] { "f", "o", "o", "symbol" })) {
+            final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", Arrays.asList(conf)).build();
             new CharGroupTokenizerFactory(indexProperties, null, name, settings).create();
             // no exception
         }
@@ -60,7 +65,7 @@ public void testTokenization() throws IOException {
         final Index index = new Index("test", "_na_");
         final String name = "cg";
         final Settings indexSettings = newAnalysisSettingsBuilder().build();
-        final Settings settings = newAnalysisSettingsBuilder().put("tokenize_on_chars", "\\s:<>$").build();
+        final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", "whitespace", ":", "\\u0024").build();
         Tokenizer tokenizer = new CharGroupTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings),
             null, name, settings).create();
         tokenizer.setReader(new StringReader("foo bar $34 test:test2"));
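With the final configuration format, character groups can be mixed with literal characters and `\\uXXXX` escapes, as the updated tests above show. An `_analyze` request in the style of the docs added by this series, exercising the `punctuation` and `symbol` groups introduced in patch 2 (an illustrative sketch, not taken from the patch):

```
POST _analyze
{
  "tokenizer": {
    "type": "char_group",
    "tokenize_on_chars": [
      "whitespace",
      "punctuation",
      "symbol"
    ]
  },
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone for $2"
}
```

Since `-`, `'` and `$` all fall into the configured groups here, this should split `Brown-Foxes` and `dog's` apart and strip the `$` from `$2`, in contrast to the patch 1 example output above, where only the characters listed in the string acted as delimiters.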