From 1da9416e3c56ee75174eddb161067e90d41f525b Mon Sep 17 00:00:00 2001
From: Itamar Syn-Hershko
Date: Wed, 19 Apr 2017 16:45:52 +0300
Subject: [PATCH 1/4] [Feature] Adding a char_group tokenizer

=== Char Group Tokenizer

The `char_group` tokenizer breaks text into terms whenever it encounters a
character which is in a defined set. It is mostly useful for cases where a simple
custom tokenization is desired, and the overhead of the <<analysis-pattern-tokenizer,`pattern` tokenizer>>
is not acceptable.

=== Configuration

The `char_group` tokenizer accepts one parameter:

`tokenize_on_chars`::
    A string containing a list of characters to tokenize the string on. Whenever a character
    from this list is encountered, a new token is started. Also supports escaped values like `\\n` and `\\f`,
    and in addition `\\s` to represent whitespace, `\\d` to represent digits and `\\w` to represent letters.
    Defaults to an empty list.

=== Example output

```The 2 QUICK Brown-Foxes jumped over the lazy dog's bone for $2```

When the configuration `\\s-:<>` is used for `tokenize_on_chars`, the above sentence would produce the following terms:

```[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone, for, $2 ]```
---
 .../analysis/CharGroupTokenizerFactory.java   | 125 ++++++++++++++++++
 .../indices/analysis/AnalysisModule.java      |   2 +
 .../CharGroupTokenizerFactoryTests.java       |  69 ++++++++++
 .../tokenizers/chargroup-tokenizer.asciidoc   |  31 +++++
 4 files changed, 227 insertions(+)
 create mode 100644 core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java
 create mode 100644 core/src/test/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactoryTests.java
 create mode 100644 docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc
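To see the intended behaviour end to end, the example above can be reproduced with an `_analyze` request once the tokenizer is registered. This is a minimal sketch; the inline-tokenizer request form shown here is the one used by the docs later in this series, and is assumed rather than taken from this patch:

```
POST _analyze
{
  "tokenizer": {
    "type": "char_group",
    "tokenize_on_chars": "\\s-:<>"
  },
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone for $2"
}
```

Whitespace, `-`, `:`, `<` and `>` each end the current token, while characters outside the set, such as `'` and `$`, are kept, which is why `dog's` and `$2` survive as single terms.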
diff --git a/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java
new file mode 100644
index 0000000000000..b6857deb19065
--- /dev/null
+++ b/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.CharTokenizer;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+
+import java.util.HashSet;
+import java.util.Set;
+
+public class CharGroupTokenizerFactory extends AbstractTokenizerFactory{
+
+    private final Set<Integer> tokenizeOnChars = new HashSet<>();
+    private boolean tokenizeOnSpace = false;
+    private boolean tokenizeOnLetter = false;
+    private boolean tokenizeOnDigit = false;
+
+    public CharGroupTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+        super(indexSettings, name, settings);
+
+        char[] chars = parseCharsList(settings.get("tokenize_on_chars"));
+        if (chars != null) {
+            for (char c : chars) {
+                tokenizeOnChars.add((int) c);
+            }
+        }
+    }
+
+    private char[] parseCharsList(final String s) {
+        char[] out = new char[256];
+        int readPos = 0;
+        int len = s.length();
+        int writePos = 0;
+        while (readPos < len) {
+            char c = s.charAt(readPos++);
+            if (c == '\\') {
+                if (readPos >= len)
+                    throw new RuntimeException("Invalid escaped char in [" + s + "]");
+                c = s.charAt(readPos++);
+                switch (c) {
+                    case '\\':
+                        c = '\\';
+                        break;
+                    case 'n':
+                        c = '\n';
+                        break;
+                    case 't':
+                        c = '\t';
+                        break;
+                    case 'r':
+                        c = '\r';
+                        break;
+                    case 'b':
+                        c = '\b';
+                        break;
+                    case 'f':
+                        c = '\f';
+                        break;
+                    case 'u':
+                        if (readPos + 3 >= len)
+                            throw new RuntimeException("Invalid escaped char in [" + s + "]");
+                        c = (char) Integer.parseInt(s.substring(readPos, readPos + 4), 16);
+                        readPos += 4;
+                        break;
+                    case 's':
+                        tokenizeOnSpace = true;
+                        writePos++;
+                        continue;
+                    case 'd':
+                        tokenizeOnDigit = true;
+                        writePos++;
+                        continue;
+                    case 'w':
+                        tokenizeOnLetter = true;
+                        writePos++;
+                        continue;
+                    default:
+                        throw new RuntimeException("Invalid escaped char " + c + " in [" + s + "]");
+                }
+            }
+            out[writePos++] = c;
+        }
+        return out;
+    }
+
+    @Override
+    public Tokenizer create() {
+        return new CharTokenizer() {
+            @Override
+            protected boolean isTokenChar(int c) {
+                if (tokenizeOnSpace && Character.isWhitespace(c)) {
+                    return false;
+                }
+                if (tokenizeOnLetter && Character.isLetter(c)) {
+                    return false;
+                }
+                if (tokenizeOnDigit && Character.isDigit(c)) {
+                    return false;
+                }
+                // TODO also support PUNCTUATION and SYMBOL a la CharMatcher ?
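+                // CharTokenizer contract: isTokenChar(c) == true keeps c inside
+                // the current token, while returning false ends the token there,
+                // so the class flags above and the explicit set checked below
+                // both turn their matching characters into delimiters.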
+                return !tokenizeOnChars.contains(c);
+            }
+        };
+    }
+}
diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
index 61950942e6076..5625607219c4a 100644
--- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
+++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
@@ -86,6 +86,7 @@ import org.elasticsearch.index.analysis.LatvianAnalyzerProvider;
 import org.elasticsearch.index.analysis.LengthTokenFilterFactory;
 import org.elasticsearch.index.analysis.LetterTokenizerFactory;
+import org.elasticsearch.index.analysis.CharGroupTokenizerFactory;
 import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory;
 import org.elasticsearch.index.analysis.LithuanianAnalyzerProvider;
 import org.elasticsearch.index.analysis.LowerCaseTokenFilterFactory;
@@ -276,6 +277,7 @@ private NamedRegistry<AnalysisProvider<TokenizerFactory>> setupTokenizers(List<AnalysisPlugin> plugins) {
+        tokenizers.register("char_group", CharGroupTokenizerFactory::new);
diff --git a/core/src/test/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactoryTests.java
new file mode 100644
index 0000000000000..04d772926c6e5
--- /dev/null
+++ b/core/src/test/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactoryTests.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.test.ESTokenStreamTestCase;
+import org.elasticsearch.test.IndexSettingsModule;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Arrays;
+
+public class CharGroupTokenizerFactoryTests extends ESTokenStreamTestCase {
+
+    public void testParseTokenChars() {
+        final Index index = new Index("test", "_na_");
+        final String name = "cg";
+        final Settings indexSettings = newAnalysisSettingsBuilder().build();
+        IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
+        for (String conf : Arrays.asList("\\v", "abc\\$")) {
+            final Settings settings = newAnalysisSettingsBuilder().put("tokenize_on_chars", conf).build();
+            try {
+                new CharGroupTokenizerFactory(indexProperties, null, name, settings).create();
+                fail();
+            } catch (RuntimeException expected) {
+                // OK
+            }
+        }
+
+        for (String conf : Arrays.asList("", "\\s", "abc", "abc\\s", "\\w", "foo\\d")) {
+            final Settings settings = newAnalysisSettingsBuilder().put("tokenize_on_chars", conf).build();
+            indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
+            new CharGroupTokenizerFactory(indexProperties, null, name, settings).create();
+            // no exception
+        }
+    }
+
+    public void testTokenization() throws IOException {
+        final Index index = new Index("test", "_na_");
+        final String name = "cg";
+        final Settings indexSettings = newAnalysisSettingsBuilder().build();
+        final Settings settings = newAnalysisSettingsBuilder().put("tokenize_on_chars", "\\s:<>$").build();
+        Tokenizer tokenizer = new CharGroupTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings),
+            null, name, settings).create();
+        tokenizer.setReader(new StringReader("foo bar $34 test:test2"));
+        assertTokenStreamContents(tokenizer, new String[] {"foo", "bar", "34", "test", "test2"});
+    }
+}
diff --git a/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc
new file mode 100644
index 0000000000000..b1970f18fa2db
--- /dev/null
+++ b/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc
@@ -0,0 +1,31 @@
+[[analysis-chargroup-tokenizer]]
+=== Char Group Tokenizer
+
+The `char_group` tokenizer breaks text into terms whenever it encounters a
+character which is in a defined set. It is mostly useful for cases where a simple
+custom tokenization is desired, and the overhead of the <<analysis-pattern-tokenizer,`pattern` tokenizer>>
+is not acceptable.
+
+[float]
+=== Configuration
+
+The `char_group` tokenizer accepts one parameter:
+
+[horizontal]
+`tokenize_on_chars`::
+    A string containing a list of characters to tokenize the string on. Whenever a character
+    from this list is encountered, a new token is started. Also supports escaped values like `\\n` and `\\f`,
+    and in addition `\\s` to represent whitespace, `\\d` to represent digits and `\\w` to represent letters.
+    Defaults to an empty list.
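+
+For example, an index could define a custom tokenizer of this type along the
+following lines (a minimal illustrative sketch; the index name, tokenizer name
+and character set are placeholders, not part of the patch):
+
+[source,js]
+---------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "tokenizer": {
+        "my_tokenizer": {
+          "type": "char_group",
+          "tokenize_on_chars": "\\s-:<>"
+        }
+      }
+    }
+  }
+}
+---------------------------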
+
+[float]
+=== Example output
+
+```The 2 QUICK Brown-Foxes jumped over the lazy dog's bone for $2```
+
+When the configuration `\\s-:<>` is used for `tokenize_on_chars`, the above sentence would produce the following terms:
+
+[source,text]
+---------------------------
+[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone, for, $2 ]
+---------------------------

From 9c8c4ea8457df3dafe6086bc468c9348dd72bd08 Mon Sep 17 00:00:00 2001
From: Itamar Syn-Hershko
Date: Wed, 13 Sep 2017 13:08:41 +0300
Subject: [PATCH 2/4] Updating config

---
 .../analysis/CharGroupTokenizerFactory.java   | 112 ++++++++++--------
 1 file changed, 60 insertions(+), 52 deletions(-)

diff --git a/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java
index b6857deb19065..406815cb9f3e9 100644
--- a/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java
+++ b/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java
@@ -34,73 +34,76 @@ public class CharGroupTokenizerFactory extends AbstractTokenizerFactory{
     private boolean tokenizeOnSpace = false;
     private boolean tokenizeOnLetter = false;
     private boolean tokenizeOnDigit = false;
+    private boolean tokenizeOnPunctuation = false;
+    private boolean tokenizeOnSymbol = false;
 
     public CharGroupTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
 
-        char[] chars = parseCharsList(settings.get("tokenize_on_chars"));
-        if (chars != null) {
-            for (char c : chars) {
-                tokenizeOnChars.add((int) c);
+        for (final String c : settings.getAsArray(settings.get("tokenize_on_chars"))) {
+            if (c == null || c.length() == 0) {
+                throw new RuntimeException("tokenize_on_chars cannot contain empty characters");
             }
-        }
-    }
 
-    private char[] parseCharsList(final String s) {
-        char[] out = new char[256];
-        int readPos = 0;
-        int len = s.length();
-        int writePos = 0;
-        while (readPos < len) {
-            char c = s.charAt(readPos++);
-            if (c == '\\') {
-                if (readPos >= len)
-                    throw new RuntimeException("Invalid escaped char in [" + s + "]");
-                c = s.charAt(readPos++);
+            if (c.length() == 1) {
+                tokenizeOnChars.add((int) c.charAt(0));
+            }
+            else if (c.charAt(0) == '\\') {
+                tokenizeOnChars.add((int) parseEscapedChar(c));
+            } else {
                 switch (c) {
-                    case '\\':
-                        c = '\\';
-                        break;
-                    case 'n':
-                        c = '\n';
-                        break;
-                    case 't':
-                        c = '\t';
+                    case "letter":
+                        tokenizeOnLetter = true;
                         break;
-                    case 'r':
-                        c = '\r';
+                    case "digit":
+                        tokenizeOnDigit = true;
                         break;
-                    case 'b':
-                        c = '\b';
+                    case "whitespace":
+                        tokenizeOnSpace = true;
                         break;
-                    case 'f':
-                        c = '\f';
+                    case "punctuation":
+                        tokenizeOnPunctuation = true;
                         break;
-                    case 'u':
-                        if (readPos + 3 >= len)
-                            throw new RuntimeException("Invalid escaped char in [" + s + "]");
-                        c = (char) Integer.parseInt(s.substring(readPos, readPos + 4), 16);
-                        readPos += 4;
+                    case "symbol":
+                        tokenizeOnSymbol = true;
                         break;
-                    case 's':
-                        tokenizeOnSpace = true;
-                        writePos++;
-                        continue;
-                    case 'd':
-                        tokenizeOnDigit = true;
-                        writePos++;
-                        continue;
-                    case 'w':
-                        tokenizeOnLetter = true;
-                        writePos++;
-                        continue;
                     default:
-                        throw new RuntimeException("Invalid escaped char " + c + " in [" + s + "]");
+                        throw new RuntimeException("Invalid escaped char in [" + c + "]");
                 }
             }
-            out[writePos++] = c;
         }
-        return out;
+    }
+
+    private char parseEscapedChar(final String s) {
+        int len = s.length();
+        char c = s.charAt(0);
+        if (c == '\\') {
+            if (1 >= len)
+                throw new RuntimeException("Invalid escaped char in [" + s + "]");
+            c = s.charAt(1);
+            switch (c) {
+                case '\\':
+                    return '\\';
+                case 'n':
+                    return '\n';
+                case 't':
+                    return '\t';
+                case 'r':
+                    return '\r';
+                case 'b':
+                    return '\b';
+                case 'f':
+                    return '\f';
+                case 'u':
+                    if (4 >= len)
+                        throw new RuntimeException("Invalid escaped char in [" + s + "]");
+                    return (char) Integer.parseInt(s.substring(1, 5), 16);
+                default:
+                    throw new RuntimeException("Invalid escaped char " + c + " in [" + s + "]");
+            }
+        } else {
+            throw new RuntimeException("Invalid escaped char [" + s + "]");
+        }
     }
 
     @Override
@@ -117,7 +120,12 @@ protected boolean isTokenChar(int c) {
             if (tokenizeOnDigit && Character.isDigit(c)) {
                 return false;
             }
-            // TODO also support PUNCTUATION and SYMBOL a la CharMatcher ?
+            if (tokenizeOnPunctuation && CharMatcher.Basic.PUNCTUATION.isTokenChar(c)) {
+                return false;
+            }
+            if (tokenizeOnSymbol && CharMatcher.Basic.PUNCTUATION.isTokenChar(c)) {
+                return false;
+            }
             return !tokenizeOnChars.contains(c);
         }
     };

From 014e64c7167e8a81cbbe720847807c2baf99acfd Mon Sep 17 00:00:00 2001
From: Itamar Syn-Hershko
Date: Wed, 13 Sep 2017 13:10:27 +0300
Subject: [PATCH 3/4] Fixing typo

---
 .../elasticsearch/index/analysis/CharGroupTokenizerFactory.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java
index 406815cb9f3e9..2aec9ae749f05 100644
--- a/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java
+++ b/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java
@@ -123,7 +123,7 @@ protected boolean isTokenChar(int c) {
             if (tokenizeOnPunctuation && CharMatcher.Basic.PUNCTUATION.isTokenChar(c)) {
                 return false;
             }
-            if (tokenizeOnSymbol && CharMatcher.Basic.PUNCTUATION.isTokenChar(c)) {
+            if (tokenizeOnSymbol && CharMatcher.Basic.SYMBOL.isTokenChar(c)) {
                 return false;
             }
             return !tokenizeOnChars.contains(c);

From 752d0cfafdae6c5645ce6ea403ac0f483f188db1 Mon Sep 17 00:00:00 2001
From: Adrien Grand
Date: Fri, 18 May 2018 17:13:38 +0200
Subject: [PATCH 4/4] Move things to analysis/common.

---
 docs/reference/analysis/tokenizers.asciidoc   |  7 ++
 .../tokenizers/chargroup-tokenizer.asciidoc   | 65 ++++++++++++++---
 .../common}/CharGroupTokenizerFactory.java    | 12 ++--
 .../analysis/common/CommonAnalysisPlugin.java |  1 +
 .../CharGroupTokenizerFactoryTests.java       | 35 +++++----
 5 files changed, 92 insertions(+), 28 deletions(-)
 rename {core/src/main/java/org/elasticsearch/index/analysis => modules/analysis-common/src/main/java/org/elasticsearch/analysis/common}/CharGroupTokenizerFactory.java (91%)
 rename {core/src/test/java/org/elasticsearch/index/analysis => modules/analysis-common/src/test/java/org/elasticsearch/analysis/common}/CharGroupTokenizerFactoryTests.java (71%)

diff --git a/docs/reference/analysis/tokenizers.asciidoc b/docs/reference/analysis/tokenizers.asciidoc
index add0abdec0123..d6f15ded05fab 100644
--- a/docs/reference/analysis/tokenizers.asciidoc
+++ b/docs/reference/analysis/tokenizers.asciidoc
@@ -103,6 +103,11 @@
 The `simple_pattern` tokenizer uses a regular expression to capture matching
 text as terms. It uses a restricted subset of regular expression features
 and is generally faster than the `pattern` tokenizer.
 
+<<analysis-chargroup-tokenizer,Char Group Tokenizer>>::
+
+The `char_group` tokenizer is configurable through sets of characters to split
+on, which is usually less expensive than running regular expressions.
+
 <<analysis-simplepatternsplit-tokenizer,Simple Pattern Split Tokenizer>>::
 
 The `simple_pattern_split` tokenizer uses the same restricted regular expression
@@ -143,6 +148,8 @@
 include::tokenizers/keyword-tokenizer.asciidoc[]
 
 include::tokenizers/pattern-tokenizer.asciidoc[]
 
+include::tokenizers/chargroup-tokenizer.asciidoc[]
+
 include::tokenizers/simplepattern-tokenizer.asciidoc[]
 
 include::tokenizers/simplepatternsplit-tokenizer.asciidoc[]
diff --git a/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc
index b1970f18fa2db..e6bf79b0e961f 100644
--- a/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc
+++ b/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc
@@ -13,19 +13,68 @@ The `char_group` tokenizer accepts one parameter:
 
 [horizontal]
 `tokenize_on_chars`::
-    A string containing a list of characters to tokenize the string on. Whenever a character
-    from this list is encountered, a new token is started. Also supports escaped values like `\\n` and `\\f`,
-    and in addition `\\s` to represent whitespace, `\\d` to represent digits and `\\w` to represent letters.
-    Defaults to an empty list.
+    A list of characters to tokenize the string on. Whenever a character
+    from this list is encountered, a new token is started. This accepts either single
+    characters such as `-`, or character groups: `whitespace`, `letter`, `digit`,
+    `punctuation`, `symbol`.
+
 
 [float]
 === Example output
 
-```The 2 QUICK Brown-Foxes jumped over the lazy dog's bone for $2```
-
-When the configuration `\\s-:<>` is used for `tokenize_on_chars`, the above sentence would produce the following terms:
-
-[source,text]
+[source,js]
 ---------------------------
-[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone, for, $2 ]
+POST _analyze
+{
+  "tokenizer": {
+    "type": "char_group",
+    "tokenize_on_chars": [
+      "whitespace",
+      "-",
+      "\n"
+    ]
+  },
+  "text": "The QUICK brown-fox"
+}
+---------------------------
+// CONSOLE
+
+returns
+
+[source,js]
+---------------------------
+{
+  "tokens": [
+    {
+      "token": "The",
+      "start_offset": 0,
+      "end_offset": 3,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "QUICK",
+      "start_offset": 4,
+      "end_offset": 9,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "brown",
+      "start_offset": 10,
+      "end_offset": 15,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "fox",
+      "start_offset": 16,
+      "end_offset": 19,
+      "type": "word",
+      "position": 3
+    }
+  ]
+}
 ---------------------------
+// TESTRESPONSE
+
diff --git a/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactory.java
similarity index 91%
rename from core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java
rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactory.java
index 2aec9ae749f05..d4e1e794a309b 100644
--- a/core/src/main/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactory.java
@@ -17,13 +17,14 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.util.CharTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
 import java.util.HashSet;
 import java.util.Set;
@@ -40,9 +41,9 @@ public class CharGroupTokenizerFactory extends AbstractTokenizerFactory{
     public CharGroupTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
 
-        for (final String c : settings.getAsArray(settings.get("tokenize_on_chars"))) {
+        for (final String c : settings.getAsList("tokenize_on_chars")) {
             if (c == null || c.length() == 0) {
-                throw new RuntimeException("tokenize_on_chars cannot contain empty characters");
+                throw new RuntimeException("[tokenize_on_chars] cannot contain empty characters");
             }
 
             if (c.length() == 1) {
@@ -95,9 +96,10 @@ private char parseEscapedChar(final String s) {
             case 'f':
                 return '\f';
             case 'u':
-                if (4 >= len)
+                if (len > 6) {
                     throw new RuntimeException("Invalid escaped char in [" + s + "]");
-                return (char) Integer.parseInt(s.substring(1, 5), 16);
+                }
+                return (char) Integer.parseInt(s.substring(2), 16);
             default:
                 throw new RuntimeException("Invalid escaped char " + c + " in [" + s + "]");
             }
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
index 624194092a02e..02a4197fba94a 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@@ -184,6 +184,7 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
     tokenizers.put("ngram", NGramTokenizerFactory::new);
     tokenizers.put("edgeNGram", EdgeNGramTokenizerFactory::new);
     tokenizers.put("edge_ngram", EdgeNGramTokenizerFactory::new);
+    tokenizers.put("char_group", CharGroupTokenizerFactory::new);
     tokenizers.put("classic", ClassicTokenizerFactory::new);
     tokenizers.put("letter", LetterTokenizerFactory::new);
     tokenizers.put("lowercase", LowerCaseTokenizerFactory::new);
diff --git a/core/src/test/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactoryTests.java
similarity index 71%
rename from core/src/test/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactoryTests.java
rename to modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactoryTests.java
index 04d772926c6e5..1447531aa8731 100644
--- a/core/src/test/java/org/elasticsearch/index/analysis/CharGroupTokenizerFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactoryTests.java
@@ -17,7 +17,7 @@
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.elasticsearch.common.settings.Settings;
@@ -34,23 +34,28 @@ public class CharGroupTokenizerFactoryTests extends ESTokenStreamTestCase {
 
     public void testParseTokenChars() {
         final Index index = new Index("test", "_na_");
-        final String name = "cg";
         final Settings indexSettings = newAnalysisSettingsBuilder().build();
         IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
-        for (String conf : Arrays.asList("\\v", "abc\\$")) {
-            final Settings settings = newAnalysisSettingsBuilder().put("tokenize_on_chars", conf).build();
-            try {
-                new CharGroupTokenizerFactory(indexProperties, null, name, settings).create();
-                fail();
-            } catch (RuntimeException expected) {
-                // OK
-            }
+        final String name = "cg";
+        for (String[] conf : Arrays.asList(
+            new String[] { "\\v" },
+            new String[] { "\\u00245" },
+            new String[] { "commas" },
+            new String[] { "a", "b", "c", "\\$" })) {
+            final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", conf).build();
+            expectThrows(RuntimeException.class, () -> new CharGroupTokenizerFactory(indexProperties, null, name, settings).create());
         }
 
-        for (String conf : Arrays.asList("", "\\s", "abc", "abc\\s", "\\w", "foo\\d")) {
-            final Settings settings = newAnalysisSettingsBuilder().put("tokenize_on_chars", conf).build();
-            indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
-
+        for (String[] conf : Arrays.asList(
+            new String[0],
+            new String[] { "\\n" },
+            new String[] { "\\u0024" },
+            new String[] { "whitespace" },
+            new String[] { "a", "b", "c" },
+            new String[] { "a", "b", "c", "\\r" },
+            new String[] { "\\r" },
+            new String[] { "f", "o", "o", "symbol" })) {
+            final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", Arrays.asList(conf)).build();
             new CharGroupTokenizerFactory(indexProperties, null, name, settings).create();
             // no exception
         }
@@ -60,7 +65,7 @@ public void testTokenization() throws IOException {
         final Index index = new Index("test", "_na_");
         final String name = "cg";
         final Settings indexSettings = newAnalysisSettingsBuilder().build();
-        final Settings settings = newAnalysisSettingsBuilder().put("tokenize_on_chars", "\\s:<>$").build();
+        final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", "whitespace", ":", "\\u0024").build();
         Tokenizer tokenizer = new CharGroupTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings),
             null, name, settings).create();
         tokenizer.setReader(new StringReader("foo bar $34 test:test2"));
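With the final configuration format, character groups can be mixed with literal characters and `\\uXXXX` escapes, as the updated tests above show. An `_analyze` request in the style of the docs added by this series, exercising the `punctuation` and `symbol` groups introduced in patch 2 (an illustrative sketch, not taken from the patch):

```
POST _analyze
{
  "tokenizer": {
    "type": "char_group",
    "tokenize_on_chars": [
      "whitespace",
      "punctuation",
      "symbol"
    ]
  },
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone for $2"
}
```

Since `-`, `'` and `$` all fall into the configured groups here, this should split `Brown-Foxes` and `dog's` apart and strip the `$` from `$2`, in contrast to the patch 1 example output above, where only the characters listed in the string acted as delimiters.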