
Commit a1f6389
Merge branch 'main' into remove-TermInSet-getTermData-usage
cbuescher committed Sep 19, 2024
2 parents 7ef6dc8 + 2d47cd7 commit a1f6389
Showing 12 changed files with 338 additions and 63 deletions.
2 changes: 1 addition & 1 deletion .buildkite/pipelines/periodic.yml
@@ -584,7 +584,7 @@ steps:
machineType: n2-standard-8
buildDirectory: /dev/shm/bk
diskSizeGb: 250
if: build.branch == "main" || build.branch == "7.17"
if: build.branch == "main" || build.branch == "8.x" || build.branch == "7.17"
- label: check-branch-consistency
command: .ci/scripts/run-gradle.sh branchConsistency
timeout_in_minutes: 15
6 changes: 6 additions & 0 deletions docs/changelog/113027.yaml
@@ -0,0 +1,6 @@
pr: 113027
summary: Retrieve the source for objects and arrays in a separate parsing phase
area: Mapping
type: bug
issues:
- 112374

@@ -11,10 +11,10 @@

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.TreeMap;
import java.util.function.Function;
import java.util.stream.Collectors;

@@ -32,7 +32,7 @@ class SourceTransforms {
* @return flattened map
*/
public static Map<String, List<Object>> normalize(Map<String, Object> map) {
var flattened = new HashMap<String, List<Object>>();
var flattened = new TreeMap<String, List<Object>>();

descend(null, map, flattened);

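The change above swaps HashMap for TreeMap so the flattened map iterates its keys in natural (sorted) order, making the normalized form of a source document deterministic; HashMap iteration order depends on hashing and can differ between runs. A minimal stand-alone sketch of the difference (the demo class and sample keys are illustrative, not part of the repository):

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

public class TreeMapOrderingDemo {
    public static void main(String[] args) {
        // TreeMap keeps keys sorted, so a flattened source map prints
        // (and compares) in a stable order across runs.
        Map<String, List<Object>> flattened = new TreeMap<>();
        flattened.put("path.to.b", List.of(2));
        flattened.put("path.to.a", List.of(1));
        System.out.println(flattened.keySet()); // [path.to.a, path.to.b]

        // The same entries in a HashMap have no guaranteed order.
        Map<String, List<Object>> unordered = new HashMap<>(flattened);
        System.out.println(unordered.keySet()); // order unspecified
    }
}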
12 changes: 3 additions & 9 deletions muted-tests.yml
@@ -115,9 +115,6 @@ tests:
- class: org.elasticsearch.search.retriever.RankDocRetrieverBuilderIT
method: testRankDocsRetrieverWithCollapse
issue: https://github.com/elastic/elasticsearch/issues/112254
- class: org.elasticsearch.datastreams.logsdb.qa.StandardVersusLogsIndexModeRandomDataChallengeRestIT
method: testMatchAllQuery
issue: https://github.com/elastic/elasticsearch/issues/112374
- class: org.elasticsearch.smoketest.DocsClientYamlTestSuiteIT
method: test {yaml=reference/rest-api/watcher/put-watch/line_120}
issue: https://github.com/elastic/elasticsearch/issues/99517
@@ -201,12 +198,6 @@ tests:
- class: org.elasticsearch.xpack.ml.integration.MlJobIT
method: testDeleteJobAfterMissingAliases
issue: https://github.com/elastic/elasticsearch/issues/112823
- class: org.elasticsearch.datastreams.logsdb.qa.StandardVersusLogsIndexModeRandomDataChallengeRestIT
method: testDateHistogramAggregation
issue: https://github.com/elastic/elasticsearch/issues/112919
- class: org.elasticsearch.datastreams.logsdb.qa.StandardVersusLogsIndexModeRandomDataChallengeRestIT
method: testTermsQuery
issue: https://github.com/elastic/elasticsearch/issues/112462
- class: org.elasticsearch.repositories.blobstore.testkit.analyze.HdfsRepositoryAnalysisRestIT
issue: https://github.com/elastic/elasticsearch/issues/112889
- class: org.elasticsearch.xpack.ml.integration.MlJobIT
@@ -238,6 +229,9 @@ tests:
- class: org.elasticsearch.packaging.test.WindowsServiceTests
method: test30StartStop
issue: https://github.com/elastic/elasticsearch/issues/113160
- class: org.elasticsearch.backwards.MixedClusterClientYamlTestSuiteIT
method: test {p0=indices.create/21_synthetic_source_stored/index param - nested array within array}
issue: https://github.com/elastic/elasticsearch/issues/113173

# Examples:
#

@@ -553,6 +553,7 @@ object array in object with dynamic override:
body:
- '{ "create": { } }'
- '{ "path_no": [ { "some_int": 10 }, {"name": "foo"} ], "path_runtime": [ { "some_int": 20 }, {"name": "bar"} ], "name": "baz" }'
- match: { errors: false }

- do:
search:

@@ -301,6 +301,52 @@ object param - nested object with stored array:
- match: { hits.hits.1._source.nested_array_stored.1.b.1.c: 200 }


---
index param - nested array within array:
- requires:
cluster_features: ["mapper.synthetic_source_keep"]
reason: requires tracking ignored source

- do:
indices.create:
index: test
body:
mappings:
_source:
mode: synthetic
properties:
name:
type: keyword
path:
properties:
to:
properties:
some:
store_array_source: true
properties:
id:
type: integer

- do:
bulk:
index: test
refresh: true
body:
- '{ "create": { } }'
- '{ "name": "A", "path": [ { "to": [ { "some" : [ { "id": 10 }, { "id": [1, 3, 2] } ] }, { "some": { "id": 100 } } ] }, { "to": { "some": { "id": [1000, 2000] } } } ] }'
- match: { errors: false }

- do:
search:
index: test
sort: name
- match: { hits.hits.0._source.name: A }
- match: { hits.hits.0._source.path.to.some.0.id: 10 }
- match: { hits.hits.0._source.path.to.some.1.id: [ 1, 3, 2] }
- match: { hits.hits.0._source.path.to.some.2.id: 100 }
- match: { hits.hits.0._source.path.to.some.3.id: [ 1000, 2000 ] }
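The assertions above capture how the doubly nested arrays come back from synthetic source: the four `some` objects across both nesting levels surface as one flat `path.to.some` array, in document order. A stand-alone sketch of that flattening in plain Java (illustrative only, not the Elasticsearch implementation):

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

public class FlattenDemo {
    // Collect every value found under the given key, walking nested
    // lists and maps in document order - mirroring how the test's
    // doubly nested "some" objects surface as one flat array.
    static void collect(Object node, String key, List<Object> out) {
        if (node instanceof Map<?, ?> map) {
            for (var entry : map.entrySet()) {
                if (entry.getKey().equals(key)) {
                    if (entry.getValue() instanceof List<?> list) {
                        out.addAll(list);
                    } else {
                        out.add(entry.getValue());
                    }
                } else {
                    collect(entry.getValue(), key, out);
                }
            }
        } else if (node instanceof List<?> list) {
            for (Object item : list) {
                collect(item, key, out);
            }
        }
    }

    public static void main(String[] args) {
        // The "path" value from the test document above.
        Object path = List.of(
            Map.of("to", List.of(
                Map.of("some", List.of(Map.of("id", 10), Map.of("id", List.of(1, 3, 2)))),
                Map.of("some", Map.of("id", 100))
            )),
            Map.of("to", Map.of("some", Map.of("id", List.of(1000, 2000))))
        );
        List<Object> some = new ArrayList<>();
        collect(path, "some", some);
        // Four entries, matching hits.hits.0._source.path.to.some.{0..3}
        System.out.println(some.size()); // 4
    }
}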


---
# 112156
stored field under object with store_array_source:
171 changes: 138 additions & 33 deletions server/src/main/java/org/elasticsearch/index/mapper/DocumentParser.java
@@ -16,7 +16,6 @@
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.common.xcontent.XContentHelper;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.core.Tuple;
import org.elasticsearch.index.IndexVersion;
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.index.fielddata.FieldDataContext;
@@ -36,13 +35,16 @@

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.function.Consumer;

import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.MAX_DIMS_COUNT;
@@ -118,8 +120,7 @@ public String documentDescription() {
};
}

private static void internalParseDocument(MetadataFieldMapper[] metadataFieldsMappers, DocumentParserContext context) {

private void internalParseDocument(MetadataFieldMapper[] metadataFieldsMappers, DocumentParserContext context) {
try {
final boolean emptyDoc = isEmptyDoc(context.root(), context.parser());

@@ -147,6 +148,9 @@ private static void internalParseDocument(MetadataFieldMapper[] metadataFieldsMappers, DocumentParserContext context) {

executeIndexTimeScripts(context);

// Record additional entries for {@link IgnoredSourceFieldMapper} before calling #postParse, so that they get stored.
addIgnoredSourceMissingValues(context);

for (MetadataFieldMapper metadataMapper : metadataFieldsMappers) {
metadataMapper.postParse(context);
}
@@ -155,6 +159,122 @@ private static void internalParseDocument(MetadataFieldMapper[] metadataFieldsMappers, DocumentParserContext context) {
}
}

private void addIgnoredSourceMissingValues(DocumentParserContext context) throws IOException {
Collection<IgnoredSourceFieldMapper.NameValue> ignoredFieldsMissingValues = context.getIgnoredFieldsMissingValues();
if (ignoredFieldsMissingValues.isEmpty()) {
return;
}

assert context.mappingLookup().isSourceSynthetic();
try (
XContentParser parser = XContentHelper.createParser(
parserConfiguration,
context.sourceToParse().source(),
context.sourceToParse().getXContentType()
)
) {
DocumentParserContext newContext = new RootDocumentParserContext(
context.mappingLookup(),
mappingParserContext,
context.sourceToParse(),
parser
);
var nameValues = parseDocForMissingValues(newContext, ignoredFieldsMissingValues);
for (var nameValue : nameValues) {
context.addIgnoredField(nameValue);
}
}
}

/**
* Simplified parsing version for retrieving the source of a given set of fields.
*/
private static List<IgnoredSourceFieldMapper.NameValue> parseDocForMissingValues(
DocumentParserContext context,
Collection<IgnoredSourceFieldMapper.NameValue> ignoredFieldsMissingValues
) throws IOException {
// Maps full name to the corresponding NameValue entry.
Map<String, IgnoredSourceFieldMapper.NameValue> fields = new HashMap<>(ignoredFieldsMissingValues.size());
for (var field : ignoredFieldsMissingValues) {
fields.put(field.name(), field);
}

// Generate all possible parent names for the given fields.
// This is used to skip processing objects that can't generate missing values.
Set<String> parentNames = getPossibleParentNames(fields.keySet());
List<IgnoredSourceFieldMapper.NameValue> result = new ArrayList<>();

XContentParser parser = context.parser();
XContentParser.Token currentToken = parser.nextToken();
List<String> path = new ArrayList<>();
String fieldName = null;
while (currentToken != null) {
while (currentToken != XContentParser.Token.FIELD_NAME) {
if (fieldName != null
&& (currentToken == XContentParser.Token.START_OBJECT || currentToken == XContentParser.Token.START_ARRAY)) {
if (parentNames.contains(getCurrentPath(path, fieldName)) == false) {
// No missing value under this parsing subtree, skip it.
parser.skipChildren();
} else {
path.add(fieldName);
}
fieldName = null;
} else if (currentToken == XContentParser.Token.END_OBJECT || currentToken == XContentParser.Token.END_ARRAY) {
if (currentToken == XContentParser.Token.END_OBJECT && path.isEmpty() == false) {
path.removeLast();
}
fieldName = null;
}
currentToken = parser.nextToken();
if (currentToken == null) {
return result;
}
}
fieldName = parser.currentName();
String fullName = getCurrentPath(path, fieldName);
var leaf = fields.get(fullName); // There may be multiple matches for array elements, don't use #remove.
if (leaf != null) {
parser.nextToken(); // Advance the parser to the value to be read.
result.add(leaf.cloneWithValue(XContentDataHelper.encodeToken(parser)));
parser.nextToken(); // Skip the token ending the value.
fieldName = null;
}
currentToken = parser.nextToken();
}
return result;
}

private static String getCurrentPath(List<String> path, String fieldName) {
assert fieldName != null;
return path.isEmpty() ? fieldName : String.join(".", path) + "." + fieldName;
}

/**
* Generates all possible parent object names for the given full names.
* For instance, for input ['path.to.foo', 'another.path.to.bar'], it returns:
* [ 'path', 'path.to', 'another', 'another.path', 'another.path.to' ]
*/
private static Set<String> getPossibleParentNames(Set<String> fullPaths) {
if (fullPaths.isEmpty()) {
return Collections.emptySet();
}
Set<String> paths = new HashSet<>();
for (String fullPath : fullPaths) {
String[] split = fullPath.split("\\.");
if (split.length < 2) {
continue;
}
StringBuilder builder = new StringBuilder(split[0]);
paths.add(builder.toString());
for (int i = 1; i < split.length - 1; i++) {
builder.append(".");
builder.append(split[i]);
paths.add(builder.toString());
}
}
return paths;
}

private static void executeIndexTimeScripts(DocumentParserContext context) {
List<FieldMapper> indexTimeScriptMappers = context.mappingLookup().indexTimeScriptMappers();
if (indexTimeScriptMappers.isEmpty()) {
@@ -299,16 +419,14 @@ static void parseObjectOrNested(DocumentParserContext context) throws IOException {
if (context.parent().isNested()) {
// Handle a nested object that doesn't contain an array. Arrays are handled in #parseNonDynamicArray.
if (context.parent().storeArraySource() && context.canAddIgnoredField()) {
Tuple<DocumentParserContext, XContentBuilder> tuple = XContentDataHelper.cloneSubContext(context);
context.addIgnoredField(
context = context.addIgnoredFieldFromContext(
new IgnoredSourceFieldMapper.NameValue(
context.parent().fullPath(),
context.parent().fullPath().lastIndexOf(context.parent().leafName()),
XContentDataHelper.encodeXContentBuilder(tuple.v2()),
null,
context.doc()
)
);
context = tuple.v1();
token = context.parser().currentToken();
parser = context.parser();
}
@@ -445,17 +563,10 @@ static void parseObjectOrField(DocumentParserContext context, Mapper mapper) throws IOException {
if (context.canAddIgnoredField()
&& (fieldMapper.syntheticSourceMode() == FieldMapper.SyntheticSourceMode.FALLBACK
|| (context.isWithinCopyTo() == false && context.isCopyToDestinationField(mapper.fullPath())))) {
Tuple<DocumentParserContext, XContentBuilder> contextWithSourceToStore = XContentDataHelper.cloneSubContext(context);

context.addIgnoredField(
IgnoredSourceFieldMapper.NameValue.fromContext(
context,
fieldMapper.fullPath(),
XContentDataHelper.encodeXContentBuilder(contextWithSourceToStore.v2())
)
context = context.addIgnoredFieldFromContext(
IgnoredSourceFieldMapper.NameValue.fromContext(context, fieldMapper.fullPath(), null)
);

fieldMapper.parse(contextWithSourceToStore.v1());
fieldMapper.parse(context);
} else {
fieldMapper.parse(context);
}
@@ -553,16 +664,9 @@ private static void parseObjectDynamic(DocumentParserContext context, String currentFieldName) throws IOException {
// hence we don't dynamically create empty objects under properties, but rather carry around an artificial object mapper
dynamicObjectMapper = new NoOpObjectMapper(currentFieldName, context.path().pathAsText(currentFieldName));
if (context.canAddIgnoredField()) {
// Clone the DocumentParserContext to parse its subtree twice.
Tuple<DocumentParserContext, XContentBuilder> tuple = XContentDataHelper.cloneSubContext(context);
context.addIgnoredField(
IgnoredSourceFieldMapper.NameValue.fromContext(
context,
context.path().pathAsText(currentFieldName),
XContentDataHelper.encodeXContentBuilder(tuple.v2())
)
context = context.addIgnoredFieldFromContext(
IgnoredSourceFieldMapper.NameValue.fromContext(context, context.path().pathAsText(currentFieldName), null)
);
context = tuple.v1();
}
} else {
dynamicObjectMapper = DynamicFieldsBuilder.createDynamicObjectMapper(context, currentFieldName);
@@ -686,10 +790,10 @@ private static void parseNonDynamicArray(
final String lastFieldName,
String arrayFieldName
) throws IOException {
String fullPath = context.path().pathAsText(arrayFieldName);

// Check if we need to record the array source. This only applies to synthetic source.
if (context.canAddIgnoredField()) {
String fullPath = context.path().pathAsText(arrayFieldName);

boolean objectRequiresStoringSource = mapper instanceof ObjectMapper objectMapper
&& (objectMapper.storeArraySource()
|| (context.sourceKeepModeFromIndexSettings() == Mapper.SourceKeepMode.ARRAYS
Expand All @@ -706,11 +810,7 @@ private static void parseNonDynamicArray(
|| dynamicRuntimeContext
|| fieldWithStoredArraySource
|| copyToFieldHasValuesInDocument) {
Tuple<DocumentParserContext, XContentBuilder> tuple = XContentDataHelper.cloneSubContext(context);
context.addIgnoredField(
IgnoredSourceFieldMapper.NameValue.fromContext(context, fullPath, XContentDataHelper.encodeXContentBuilder(tuple.v2()))
);
context = tuple.v1();
context = context.addIgnoredFieldFromContext(IgnoredSourceFieldMapper.NameValue.fromContext(context, fullPath, null));
} else if (mapper instanceof ObjectMapper objectMapper
&& (objectMapper.isEnabled() == false || objectMapper.dynamic == ObjectMapper.Dynamic.FALSE)) {
context.addIgnoredField(
Expand All @@ -720,6 +820,11 @@ private static void parseNonDynamicArray(
}
}

// In synthetic source, if any array element requires storing its source as-is, it takes precedence over
// elements from regular source loading that are then skipped from the synthesized array source.
// To prevent this, we track each array name, to check if it contains any sub-arrays in its elements.
context = context.cloneForArray(fullPath);

XContentParser parser = context.parser();
XContentParser.Token token;
while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
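The second parsing pass added in this commit stays cheap because of getPossibleParentNames: parseDocForMissingValues skips any object or array subtree (via parser.skipChildren()) whose dotted path cannot be a parent of a field that still needs its value. A stand-alone sketch of the prefix computation (illustrative; it mirrors the logic of the method above but is not the repository code):

import java.util.HashSet;
import java.util.Set;

public class ParentNameDemo {
    // For each dotted field name, emit every proper prefix; a subtree
    // whose path is not in this set cannot contain a wanted field,
    // so the second parsing pass can skip it wholesale.
    static Set<String> possibleParents(Set<String> fullPaths) {
        Set<String> parents = new HashSet<>();
        for (String fullPath : fullPaths) {
            String[] segments = fullPath.split("\\.");
            StringBuilder prefix = new StringBuilder();
            // Stop before the last segment: the leaf itself is not a parent.
            for (int i = 0; i < segments.length - 1; i++) {
                if (i > 0) {
                    prefix.append('.');
                }
                prefix.append(segments[i]);
                parents.add(prefix.toString());
            }
        }
        return parents;
    }

    public static void main(String[] args) {
        System.out.println(possibleParents(Set.of("path.to.foo", "another.path.to.bar")));
        // Contains: path, path.to, another, another.path, another.path.to
        // (set iteration order unspecified)
    }
}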
