From 2a88819d31153fc3549ec9cd3ef5c51d659153b3 Mon Sep 17 00:00:00 2001 From: Itamar Syn-Hershko Date: Tue, 22 May 2018 17:26:31 +0300 Subject: [PATCH] [Feature] Adding a char_group tokenizer (#24186) === Char Group Tokenizer The `char_group` tokenizer breaks text into terms whenever it encounters a character which is in a defined set. It is mostly useful for cases where a simple custom tokenization is desired, and the overhead of using the <<analysis-pattern-tokenizer, Pattern Tokenizer>> is not acceptable. === Configuration The `char_group` tokenizer accepts one parameter: `tokenize_on_chars`:: A list of characters to tokenize the string on. Whenever a character from this list is encountered, a new token is started. Each entry is either a single character such as `-`, an escaped value like `\\n` or `\\f`, or one of the character groups `whitespace`, `letter`, `digit`, `punctuation` and `symbol`. Defaults to an empty list. === Example output ```The 2 QUICK Brown-Foxes jumped over the lazy dog's bone for $2``` When `tokenize_on_chars` is set to `[ "whitespace", "-", ":", "<", ">" ]`, the above sentence would produce the following terms: ```[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone, for, $2 ]``` --- docs/reference/analysis/tokenizers.asciidoc | 7 + .../tokenizers/chargroup-tokenizer.asciidoc | 80 +++++++++++ .../common/CharGroupTokenizerFactory.java | 135 ++++++++++++++++++ .../analysis/common/CommonAnalysisPlugin.java | 1 + .../CharGroupTokenizerFactoryTests.java | 74 ++++++++++ 5 files changed, 297 insertions(+) create mode 100644 docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc create mode 100644 modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactory.java create mode 100644 modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactoryTests.java diff --git a/docs/reference/analysis/tokenizers.asciidoc b/docs/reference/analysis/tokenizers.asciidoc index add0abdec0123..d6f15ded05fab 100644 --- 
a/docs/reference/analysis/tokenizers.asciidoc +++ b/docs/reference/analysis/tokenizers.asciidoc @@ -103,6 +103,11 @@ The `simple_pattern` tokenizer uses a regular expression to capture matching text as terms. It uses a restricted subset of regular expression features and is generally faster than the `pattern` tokenizer. +<<analysis-chargroup-tokenizer,Char Group Tokenizer>>:: + +The `char_group` tokenizer is configurable through sets of characters to split +on, which is usually less expensive than running regular expressions. + <<analysis-simplepatternsplit-tokenizer,Simple Pattern Split Tokenizer>>:: The `simple_pattern_split` tokenizer uses the same restricted regular expression @@ -143,6 +148,8 @@ include::tokenizers/keyword-tokenizer.asciidoc[] include::tokenizers/pattern-tokenizer.asciidoc[] +include::tokenizers/chargroup-tokenizer.asciidoc[] + include::tokenizers/simplepattern-tokenizer.asciidoc[] include::tokenizers/simplepatternsplit-tokenizer.asciidoc[] diff --git a/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc new file mode 100644 index 0000000000000..e6bf79b0e961f --- /dev/null +++ b/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc @@ -0,0 +1,80 @@ +[[analysis-chargroup-tokenizer]] +=== Char Group Tokenizer + +The `char_group` tokenizer breaks text into terms whenever it encounters a +character which is in a defined set. It is mostly useful for cases where a simple +custom tokenization is desired, and the overhead of using the <<analysis-pattern-tokenizer, Pattern Tokenizer>> +is not acceptable. + +[float] +=== Configuration + +The `char_group` tokenizer accepts one parameter: + +[horizontal] +`tokenize_on_chars`:: + A list of characters to tokenize the string on. Whenever a character + from this list is encountered, a new token is started. This accepts either single + characters such as `-`, or character groups: `whitespace`, `letter`, `digit`, + `punctuation`, `symbol`. 
+ + +[float] +=== Example output + +[source,js] +--------------------------- +POST _analyze +{ + "tokenizer": { + "type": "char_group", + "tokenize_on_chars": [ + "whitespace", + "-", + "\n" + ] + }, + "text": "The QUICK brown-fox" +} +--------------------------- +// CONSOLE + +returns + +[source,js] +--------------------------- +{ + "tokens": [ + { + "token": "The", + "start_offset": 0, + "end_offset": 3, + "type": "word", + "position": 0 + }, + { + "token": "QUICK", + "start_offset": 4, + "end_offset": 9, + "type": "word", + "position": 1 + }, + { + "token": "brown", + "start_offset": 10, + "end_offset": 15, + "type": "word", + "position": 2 + }, + { + "token": "fox", + "start_offset": 16, + "end_offset": 19, + "type": "word", + "position": 3 + } + ] +} +--------------------------- +// TESTRESPONSE + diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactory.java new file mode 100644 index 0000000000000..d4e1e794a309b --- /dev/null +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactory.java @@ -0,0 +1,135 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.CharTokenizer;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
+
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Factory for the {@code char_group} tokenizer, which starts a new token whenever it encounters
+ * a character from a configured set.
+ *
+ * <p>The set is read from the {@code tokenize_on_chars} list setting. Each entry is one of:
+ * <ul>
+ * <li>a single character (exactly one Unicode code point), for example {@code -};</li>
+ * <li>a backslash escape: a backslash followed by one of {@code \ n t r b f}, or a backslash,
+ *     the letter {@code u} and one to four hexadecimal digits;</li>
+ * <li>a character group name: {@code letter}, {@code digit}, {@code whitespace},
+ *     {@code punctuation} or {@code symbol}.</li>
+ * </ul>
+ * Any other entry is rejected with an {@link IllegalArgumentException}.
+ */
+public class CharGroupTokenizerFactory extends AbstractTokenizerFactory {
+
+    /** Individual code points that terminate a token. */
+    private final Set<Integer> tokenizeOnChars = new HashSet<>();
+    private final boolean tokenizeOnSpace;
+    private final boolean tokenizeOnLetter;
+    private final boolean tokenizeOnDigit;
+    private final boolean tokenizeOnPunctuation;
+    private final boolean tokenizeOnSymbol;
+
+    /**
+     * Parses the {@code tokenize_on_chars} setting into a set of code points and group flags.
+     *
+     * @param indexSettings settings of the index this tokenizer belongs to
+     * @param environment   unused; present so the constructor matches the tokenizer provider signature
+     * @param name          name of the tokenizer
+     * @param settings      tokenizer settings holding the {@code tokenize_on_chars} list
+     * @throws IllegalArgumentException if an entry is empty, is not a valid escape sequence,
+     *                                  or is not a known character group name
+     */
+    public CharGroupTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+        super(indexSettings, name, settings);
+
+        // Collected in locals so that the fields can be final.
+        boolean onSpace = false;
+        boolean onLetter = false;
+        boolean onDigit = false;
+        boolean onPunctuation = false;
+        boolean onSymbol = false;
+
+        for (final String c : settings.getAsList("tokenize_on_chars")) {
+            if (c == null || c.length() == 0) {
+                throw new IllegalArgumentException("[tokenize_on_chars] cannot contain empty characters");
+            }
+
+            if (c.codePointCount(0, c.length()) == 1) {
+                // Counting code points rather than chars also accepts a single supplementary
+                // character (a surrogate pair); isTokenChar is invoked with code points.
+                tokenizeOnChars.add(c.codePointAt(0));
+            } else if (c.charAt(0) == '\\') {
+                tokenizeOnChars.add((int) parseEscapedChar(c));
+            } else {
+                switch (c) {
+                    case "letter":
+                        onLetter = true;
+                        break;
+                    case "digit":
+                        onDigit = true;
+                        break;
+                    case "whitespace":
+                        onSpace = true;
+                        break;
+                    case "punctuation":
+                        onPunctuation = true;
+                        break;
+                    case "symbol":
+                        onSymbol = true;
+                        break;
+                    default:
+                        throw new IllegalArgumentException("Invalid character group [" + c + "] in [tokenize_on_chars]");
+                }
+            }
+        }
+
+        this.tokenizeOnSpace = onSpace;
+        this.tokenizeOnLetter = onLetter;
+        this.tokenizeOnDigit = onDigit;
+        this.tokenizeOnPunctuation = onPunctuation;
+        this.tokenizeOnSymbol = onSymbol;
+    }
+
+    /**
+     * Converts a backslash escape sequence into the character it denotes.
+     *
+     * @param s the complete entry, starting with a backslash
+     * @return the character denoted by the escape
+     * @throws IllegalArgumentException if {@code s} is not exactly one supported escape sequence
+     */
+    private static char parseEscapedChar(final String s) {
+        if (s.isEmpty() || s.charAt(0) != '\\') {
+            throw new IllegalArgumentException("Invalid escaped char [" + s + "]");
+        }
+        if (s.length() < 2) {
+            throw new IllegalArgumentException("Invalid escaped char in [" + s + "]");
+        }
+
+        final char c = s.charAt(1);
+        if (c == 'u') {
+            return parseUnicodeEscape(s);
+        }
+        if (s.length() != 2) {
+            // Reject trailing garbage instead of silently ignoring everything after the escape.
+            throw new IllegalArgumentException("Invalid escaped char in [" + s + "]");
+        }
+        switch (c) {
+            case '\\':
+                return '\\';
+            case 'n':
+                return '\n';
+            case 't':
+                return '\t';
+            case 'r':
+                return '\r';
+            case 'b':
+                return '\b';
+            case 'f':
+                return '\f';
+            default:
+                throw new IllegalArgumentException("Invalid escaped char " + c + " in [" + s + "]");
+        }
+    }
+
+    /**
+     * Parses a unicode escape: a backslash, the letter {@code u} and one to four hexadecimal digits.
+     * Digits are validated one by one so that signs and empty input are rejected with a
+     * descriptive message.
+     *
+     * @param s the complete entry, starting with a backslash and {@code u}
+     * @return the character with the given hexadecimal value
+     * @throws IllegalArgumentException if there are no digits, more than four digits, or a non-hex digit
+     */
+    private static char parseUnicodeEscape(final String s) {
+        final int len = s.length();
+        if (len < 3 || len > 6) {
+            throw new IllegalArgumentException("Invalid escaped char in [" + s + "]");
+        }
+        int value = 0;
+        for (int i = 2; i < len; i++) {
+            final int digit = Character.digit(s.charAt(i), 16);
+            if (digit < 0) {
+                throw new IllegalArgumentException("Invalid escaped char in [" + s + "]");
+            }
+            value = (value << 4) | digit;
+        }
+        return (char) value;
+    }
+
+    /**
+     * Creates a tokenizer that treats every configured character, and every character belonging
+     * to an enabled group, as a token separator.
+     */
+    @Override
+    public Tokenizer create() {
+        return new CharTokenizer() {
+            @Override
+            protected boolean isTokenChar(int c) {
+                if (tokenizeOnSpace && Character.isWhitespace(c)) {
+                    return false;
+                }
+                if (tokenizeOnLetter && Character.isLetter(c)) {
+                    return false;
+                }
+                if (tokenizeOnDigit && Character.isDigit(c)) {
+                    return false;
+                }
+                if (tokenizeOnPunctuation && CharMatcher.Basic.PUNCTUATION.isTokenChar(c)) {
+                    return false;
+                }
+                if (tokenizeOnSymbol && CharMatcher.Basic.SYMBOL.isTokenChar(c)) {
+                    return false;
+                }
+                return !tokenizeOnChars.contains(c);
+            }
+        };
+    }
+}
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
index 624194092a02e..02a4197fba94a 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@@ -184,6 +184,7 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
         tokenizers.put("ngram", NGramTokenizerFactory::new);
         tokenizers.put("edgeNGram", EdgeNGramTokenizerFactory::new);
         tokenizers.put("edge_ngram", EdgeNGramTokenizerFactory::new);
+        tokenizers.put("char_group", CharGroupTokenizerFactory::new);
         tokenizers.put("classic", ClassicTokenizerFactory::new);
         tokenizers.put("letter", LetterTokenizerFactory::new);
tokenizers.put("lowercase", LowerCaseTokenizerFactory::new); diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactoryTests.java new file mode 100644 index 0000000000000..1447531aa8731 --- /dev/null +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactoryTests.java @@ -0,0 +1,74 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.test.ESTokenStreamTestCase;
+import org.elasticsearch.test.IndexSettingsModule;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Arrays;
+
+/**
+ * Tests parsing of the {@code tokenize_on_chars} setting and the resulting tokenization of
+ * {@link CharGroupTokenizerFactory}.
+ */
+public class CharGroupTokenizerFactoryTests extends ESTokenStreamTestCase {
+
+    private static final String TOKENIZER_NAME = "cg";
+
+    /** Index settings for a throwaway index named {@code test}. */
+    private static IndexSettings defaultIndexSettings() {
+        return IndexSettingsModule.newIndexSettings(new Index("test", "_na_"), newAnalysisSettingsBuilder().build());
+    }
+
+    /** Tokenizer settings whose {@code tokenize_on_chars} list holds the given entries. */
+    private static Settings tokenizerSettings(String... tokenizeOnChars) {
+        return newAnalysisSettingsBuilder().putList("tokenize_on_chars", Arrays.asList(tokenizeOnChars)).build();
+    }
+
+    public void testParseTokenChars() {
+        final IndexSettings idxSettings = defaultIndexSettings();
+
+        // Each of these configurations contains an entry the factory must reject.
+        final String[][] rejected = {
+            { "\\v" },
+            { "\\u00245" },
+            { "commas" },
+            { "a", "b", "c", "\\$" }
+        };
+        for (final String[] entries : rejected) {
+            final Settings rejectedSettings = tokenizerSettings(entries);
+            expectThrows(RuntimeException.class,
+                () -> new CharGroupTokenizerFactory(idxSettings, null, TOKENIZER_NAME, rejectedSettings).create());
+        }
+
+        // Each of these configurations must be accepted; the test fails if any of them throws.
+        final String[][] accepted = {
+            {},
+            { "\\n" },
+            { "\\u0024" },
+            { "whitespace" },
+            { "a", "b", "c" },
+            { "a", "b", "c", "\\r" },
+            { "\\r" },
+            { "f", "o", "o", "symbol" }
+        };
+        for (final String[] entries : accepted) {
+            new CharGroupTokenizerFactory(idxSettings, null, TOKENIZER_NAME, tokenizerSettings(entries)).create();
+        }
+    }
+
+    public void testTokenization() throws IOException {
+        final CharGroupTokenizerFactory factory = new CharGroupTokenizerFactory(
+            defaultIndexSettings(), null, TOKENIZER_NAME, tokenizerSettings("whitespace", ":", "\\u0024"));
+        final Tokenizer tokenizer = factory.create();
+        tokenizer.setReader(new StringReader("foo bar $34 test:test2"));
+        assertTokenStreamContents(tokenizer, new String[] {"foo", "bar", "34", "test", "test2"});
+    }
+}