[Feature] Adding a char_group tokenizer (#24186)
=== Char Group Tokenizer

The `char_group` tokenizer breaks text into terms whenever it encounters a character which is in a defined set. It is mostly useful for cases where a simple custom tokenization is desired, and the overhead of using the <<analysis-pattern-tokenizer, `pattern` tokenizer>> is not acceptable.

=== Configuration

The `char_group` tokenizer accepts one parameter:

`tokenize_on_chars`:: A string containing the characters to tokenize on. Whenever a character from this string is encountered, a new token is started. Escaped values such as `\\n` and `\\f` are supported, as are `\\s` for whitespace, `\\d` for digits and `\\w` for letters. Defaults to an empty list.

=== Example output

```
The 2 QUICK Brown-Foxes jumped over the lazy dog's bone for $2
```

When `\\s-:<>` is used for `tokenize_on_chars`, the above sentence produces the following terms:

```
[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone, for, $2 ]
```
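Under the hood this tokenizer builds on Lucene's `CharTokenizer` contract: `isTokenChar` returns `true` for characters that belong to a token, so every character or group listed in `tokenize_on_chars` must make it return `false`. Below is a minimal standalone sketch of that inversion, assuming only Lucene on the classpath (at the time of this commit `CharTokenizer` lives in `org.apache.lucene.analysis.util`); the `CharGroupDemo` class and its hard-coded separators are illustrative, not part of the commit:

```
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharTokenizer;

import java.io.StringReader;

public class CharGroupDemo {
    public static void main(String[] args) throws Exception {
        Tokenizer tokenizer = new CharTokenizer() {
            @Override
            protected boolean isTokenChar(int c) {
                // Keep everything except whitespace and '-', i.e. the effect of
                // tokenize_on_chars: ["whitespace", "-"]
                return !(Character.isWhitespace(c) || c == '-');
            }
        };
        tokenizer.setReader(new StringReader("The QUICK brown-fox"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString()); // The, QUICK, brown, fox
        }
        tokenizer.end();
        tokenizer.close();
    }
}
```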
1 parent 2cf0e4f · commit 2a88819
Showing 5 changed files with 297 additions and 0 deletions.
docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc (80 additions, 0 deletions)
@@ -0,0 +1,80 @@
[[analysis-chargroup-tokenizer]]
=== Char Group Tokenizer

The `char_group` tokenizer breaks text into terms whenever it encounters a
character which is in a defined set. It is mostly useful for cases where a simple
custom tokenization is desired, and the overhead of using the
<<analysis-pattern-tokenizer, `pattern` tokenizer>> is not acceptable.

[float]
=== Configuration

The `char_group` tokenizer accepts one parameter:

[horizontal]
`tokenize_on_chars`::
A list of characters to tokenize the string on. Whenever a character
from this list is encountered, a new token is started. This accepts either single
characters such as `-`, or character groups: `whitespace`, `letter`, `digit`,
`punctuation`, `symbol`.

[float]
=== Example output

[source,js]
---------------------------
POST _analyze
{
  "tokenizer": {
    "type": "char_group",
    "tokenize_on_chars": [
      "whitespace",
      "-",
      "\n"
    ]
  },
  "text": "The QUICK brown-fox"
}
---------------------------
// CONSOLE

returns

[source,js]
---------------------------
{
  "tokens": [
    {
      "token": "The",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 0
    },
    {
      "token": "QUICK",
      "start_offset": 4,
      "end_offset": 9,
      "type": "word",
      "position": 1
    },
    {
      "token": "brown",
      "start_offset": 10,
      "end_offset": 15,
      "type": "word",
      "position": 2
    },
    {
      "token": "fox",
      "start_offset": 16,
      "end_offset": 19,
      "type": "word",
      "position": 3
    }
  ]
}
---------------------------
// TESTRESPONSE
...sis-common/src/main/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactory.java (135 additions, 0 deletions)
@@ -0,0 +1,135 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;

import java.util.HashSet;
import java.util.Set;

public class CharGroupTokenizerFactory extends AbstractTokenizerFactory {

    // Individual code points to split on, plus flags for whole character classes.
    private final Set<Integer> tokenizeOnChars = new HashSet<>();
    private boolean tokenizeOnSpace = false;
    private boolean tokenizeOnLetter = false;
    private boolean tokenizeOnDigit = false;
    private boolean tokenizeOnPunctuation = false;
    private boolean tokenizeOnSymbol = false;

    public CharGroupTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);

        for (final String c : settings.getAsList("tokenize_on_chars")) {
            if (c == null || c.length() == 0) {
                throw new RuntimeException("[tokenize_on_chars] cannot contain empty characters");
            }

            if (c.length() == 1) {
                // A literal single character, e.g. "-".
                tokenizeOnChars.add((int) c.charAt(0));
            } else if (c.charAt(0) == '\\') {
                // An escape sequence, e.g. "\\n" or "\\u0024".
                tokenizeOnChars.add((int) parseEscapedChar(c));
            } else {
                // A named character group.
                switch (c) {
                    case "letter":
                        tokenizeOnLetter = true;
                        break;
                    case "digit":
                        tokenizeOnDigit = true;
                        break;
                    case "whitespace":
                        tokenizeOnSpace = true;
                        break;
                    case "punctuation":
                        tokenizeOnPunctuation = true;
                        break;
                    case "symbol":
                        tokenizeOnSymbol = true;
                        break;
                    default:
                        throw new RuntimeException("Invalid escaped char in [" + c + "]");
                }
            }
        }
    }

    private char parseEscapedChar(final String s) {
        int len = s.length();
        char c = s.charAt(0);
        if (c == '\\') {
            if (1 >= len) {
                throw new RuntimeException("Invalid escaped char in [" + s + "]");
            }
            c = s.charAt(1);
            switch (c) {
                case '\\':
                    return '\\';
                case 'n':
                    return '\n';
                case 't':
                    return '\t';
                case 'r':
                    return '\r';
                case 'b':
                    return '\b';
                case 'f':
                    return '\f';
                case 'u':
                    // Unicode escape of the form \\uXXXX, at most four hex digits.
                    if (len > 6) {
                        throw new RuntimeException("Invalid escaped char in [" + s + "]");
                    }
                    return (char) Integer.parseInt(s.substring(2), 16);
                default:
                    throw new RuntimeException("Invalid escaped char " + c + " in [" + s + "]");
            }
        } else {
            throw new RuntimeException("Invalid escaped char [" + s + "]");
        }
    }

    @Override
    public Tokenizer create() {
        return new CharTokenizer() {
            @Override
            protected boolean isTokenChar(int c) {
                // CharTokenizer keeps runs of characters for which this returns
                // true and splits on the rest, so every configured split
                // character or group must return false here.
                if (tokenizeOnSpace && Character.isWhitespace(c)) {
                    return false;
                }
                if (tokenizeOnLetter && Character.isLetter(c)) {
                    return false;
                }
                if (tokenizeOnDigit && Character.isDigit(c)) {
                    return false;
                }
                if (tokenizeOnPunctuation && CharMatcher.Basic.PUNCTUATION.isTokenChar(c)) {
                    return false;
                }
                if (tokenizeOnSymbol && CharMatcher.Basic.SYMBOL.isTokenChar(c)) {
                    return false;
                }
                return !tokenizeOnChars.contains(c);
            }
        };
    }
}
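The `\u` branch of `parseEscapedChar` above strips the two-character `\u` prefix and parses the remaining hex digits into a single char. Here is a tiny hypothetical check (the `EscapeDemo` class is illustrative, not part of this commit) of what that branch computes for the `\u0024` value used in the tests below:

```
public class EscapeDemo {
    public static void main(String[] args) {
        // The settings value as the factory sees it: backslash, 'u', four hex digits.
        String s = "\\u0024";
        // Mirrors the 'u' case above: skip the prefix, parse the rest as hex.
        char c = (char) Integer.parseInt(s.substring(2), 16);
        System.out.println(c); // prints: $
    }
}
```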
...ommon/src/test/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactoryTests.java (74 additions, 0 deletions)
@@ -0,0 +1,74 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.test.ESTokenStreamTestCase;
import org.elasticsearch.test.IndexSettingsModule;

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;

public class CharGroupTokenizerFactoryTests extends ESTokenStreamTestCase {
    public void testParseTokenChars() {
        final Index index = new Index("test", "_na_");
        final Settings indexSettings = newAnalysisSettingsBuilder().build();
        IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
        final String name = "cg";
        // Invalid configurations: unknown escapes, over-long unicode escapes
        // and unrecognized group names must be rejected.
        for (String[] conf : Arrays.asList(
                new String[] { "\\v" },
                new String[] { "\\u00245" },
                new String[] { "commas" },
                new String[] { "a", "b", "c", "\\$" })) {
            final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", conf).build();
            expectThrows(RuntimeException.class, () -> new CharGroupTokenizerFactory(indexProperties, null, name, settings).create());
        }

        // Valid configurations: literal characters, supported escapes and group names.
        for (String[] conf : Arrays.asList(
                new String[0],
                new String[] { "\\n" },
                new String[] { "\\u0024" },
                new String[] { "whitespace" },
                new String[] { "a", "b", "c" },
                new String[] { "a", "b", "c", "\\r" },
                new String[] { "\\r" },
                new String[] { "f", "o", "o", "symbol" })) {
            final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", Arrays.asList(conf)).build();
            new CharGroupTokenizerFactory(indexProperties, null, name, settings).create();
            // no exception
        }
    }

    public void testTokenization() throws IOException {
        final Index index = new Index("test", "_na_");
        final String name = "cg";
        final Settings indexSettings = newAnalysisSettingsBuilder().build();
        final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", "whitespace", ":", "\\u0024").build();
        Tokenizer tokenizer = new CharGroupTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings),
            null, name, settings).create();
        tokenizer.setReader(new StringReader("foo bar $34 test:test2"));
        assertTokenStreamContents(tokenizer, new String[] {"foo", "bar", "34", "test", "test2"});
    }
}
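A further case one could add in the same style, sketched here as a hypothetical extension (not part of this commit), exercises the `punctuation` and `symbol` groups together. It assumes `CharMatcher.Basic.SYMBOL` classifies the currency symbol `$` as a symbol and `CharMatcher.Basic.PUNCTUATION` covers the apostrophe, comma and question mark:

```
// Hypothetical extra test, not part of this commit: split on whitespace,
// punctuation and symbols at once.
public void testGroupTokenization() throws IOException {
    final Index index = new Index("test", "_na_");
    final Settings indexSettings = newAnalysisSettingsBuilder().build();
    final Settings settings = newAnalysisSettingsBuilder()
        .putList("tokenize_on_chars", "whitespace", "punctuation", "symbol").build();
    Tokenizer tokenizer = new CharGroupTokenizerFactory(
        IndexSettingsModule.newIndexSettings(index, indexSettings), null, "cg", settings).create();
    tokenizer.setReader(new StringReader("It's $5, OK?"));
    // The apostrophe, comma and '?' are punctuation; '$' is a symbol;
    // spaces are whitespace, so only the letter/digit runs survive.
    assertTokenStreamContents(tokenizer, new String[] {"It", "s", "5", "OK"});
}
```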