elastic · ywelsch · Feb 15, 2022 · Feb 9, 2022 · Feb 9, 2022 · Feb 9, 2022
diff --git a/docs/changelog/83738.yaml b/docs/changelog/83738.yaml
@@ -0,0 +1,6 @@
+pr: 83738
+summary: Check that the UTF-8 length of a keyword field value is not bigger than 32766 in ES, rather than in Lucene.
+area: Mapping
+type: enhancement
+issues:
+  - 80865
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/FieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/FieldMapper.java
@@ -9,6 +9,7 @@
 package org.elasticsearch.index.mapper;
 
 import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.util.UnicodeUtil;
 import org.elasticsearch.Version;
 import org.elasticsearch.common.Explicit;
 import org.elasticsearch.common.TriFunction;
@@ -46,6 +47,8 @@
 import java.util.function.Function;
 import java.util.function.Supplier;
 
+import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
+
 public abstract class FieldMapper extends Mapper implements Cloneable {
     public static final Setting<Boolean> IGNORE_MALFORMED_SETTING = Setting.boolSetting(
         "index.mapping.ignore_malformed",
@@ -235,6 +238,14 @@ public void parse(DocumentParserContext context) throws IOException {
                     valuePreview = "null";
                 } else {
                     valuePreview = complexValue.toString();
+                    if (UnicodeUtil.UTF16toUTF8(
+                        valuePreview,
+                        0,
+                        valuePreview.length(),
+                        new byte[UnicodeUtil.maxUTF8Length(valuePreview.length())]
+                    ) > BYTE_BLOCK_SIZE - 2) {
+                        valuePreview = valuePreview.substring(0, 30) + "...";
+                    }
                 }
             } catch (Exception innerException) {
                 throw new MapperParsingException(

diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java
@@ -62,6 +62,7 @@
 import java.io.IOException;
 import java.io.UncheckedIOException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.List;
@@ -70,6 +71,8 @@
 import java.util.Objects;
 import java.util.function.Supplier;
 
+import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
+
 /**
  * A field mapper for keywords. This mapper accepts strings and indexes them as-is.
  */
@@ -911,6 +914,25 @@ private void indexValue(DocumentParserContext context, String value) {
 
         // convert to utf8 only once before feeding postings/dv/stored fields
         final BytesRef binaryValue = new BytesRef(value);
+
+        // make sure the utf8 length of the keyword value is not bigger than 32766 in ES, rather than in Lucene.
+        // See https://github.com/elastic/elasticsearch/issues/80865 for details.
+        if (binaryValue.length > BYTE_BLOCK_SIZE - 2) {
+            byte[] prefix = new byte[30];
+            System.arraycopy(binaryValue.bytes, binaryValue.offset, prefix, 0, 30);
+            String msg = "Document contains at least one immense term in field=\""
+                + fieldType().name()
+                + "\" (whose "
+                + "UTF8 encoding is longer than the max length "
+                + (BYTE_BLOCK_SIZE - 2)
+                + "), all of which were "
+                + "skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense "
+                + "term is: '"
+                + Arrays.toString(prefix)
+                + "...'";
+            throw new IllegalArgumentException(msg);
+        }
+
         if (fieldType.indexOptions() != IndexOptions.NONE || fieldType.stored()) {
             Field field = new KeywordField(fieldType().name(), binaryValue, fieldType);
             context.doc().add(field);

diff --git a/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java
@@ -49,6 +49,7 @@
 import static java.util.Collections.singletonList;
 import static java.util.Collections.singletonMap;
 import static org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
+import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.instanceOf;
@@ -604,4 +605,17 @@ public void testDimensionInRoutingPath() throws IOException {
         );
         mapper.documentMapper().validate(settings, false);  // Doesn't throw
     }
+
+    public void testKeywordFieldUtf8LongThan32766() throws Exception {
+        // Build BYTE_BLOCK_SIZE 'a' characters; the keyword mapper rejects values longer than BYTE_BLOCK_SIZE - 2 UTF-8 bytes.
+        final StringBuilder oversized = new StringBuilder(BYTE_BLOCK_SIZE);
+        while (oversized.length() < BYTE_BLOCK_SIZE) {
+            oversized.append('a');
+        }
+        final DocumentMapper keywordMapper = createDocumentMapper(fieldMapping(b -> b.field("type", "keyword")));
+        final MapperParsingException failure = expectThrows(MapperParsingException.class, () -> {
+            keywordMapper.parse(source(b -> b.field("field", oversized.toString())));
+        });
+        assertThat(failure.getCause().getMessage(), containsString("UTF8 encoding is longer than the max length"));
+    }
 }