Rebase with main
Signed-off-by: Varun Jain <varunudr@amazon.com>
vibrantvarun committed Oct 23, 2024
2 parents 152aa4c + 0df59c6 commit 4d3e3d3
Showing 39 changed files with 3,260 additions and 89 deletions.
2 changes: 1 addition & 1 deletion .github/CODEOWNERS
@@ -1,2 +1,2 @@
# This should match the owning team set up in https://github.com/orgs/opensearch-project/teams
* @heemin32 @navneet1v @VijayanB @vamshin @jmazanec15 @naveentatikonda @junqiu-lei @martin-gaievski @sean-zheng-amazon @model-collapse @zane-neo @ylwu-amzn @jngz-es @vibrantvarun @zhichao-aws
* @heemin32 @navneet1v @VijayanB @vamshin @jmazanec15 @naveentatikonda @junqiu-lei @martin-gaievski @sean-zheng-amazon @model-collapse @zane-neo @ylwu-amzn @jngz-es @vibrantvarun @zhichao-aws @yuye-aws
4 changes: 2 additions & 2 deletions .github/workflows/backwards_compatibility_tests_workflow.yml
@@ -15,7 +15,7 @@ jobs:
matrix:
java: [ 21 ]
os: [ubuntu-latest,windows-latest]
bwc_version : ["2.9.0","2.10.0","2.11.0","2.12.0","2.13.0","2.14.0","2.15.0","2.16.0","2.17.0-SNAPSHOT"]
bwc_version : ["2.9.0","2.10.0","2.11.0","2.12.0","2.13.0","2.14.0","2.15.0","2.16.0","2.17.0","2.18.0-SNAPSHOT"]
opensearch_version : [ "3.0.0-SNAPSHOT" ]

name: NeuralSearch Restart-Upgrade BWC Tests
@@ -42,7 +42,7 @@ jobs:
matrix:
java: [ 21 ]
os: [ubuntu-latest,windows-latest]
bwc_version: [ "2.17.0-SNAPSHOT" ]
bwc_version: [ "2.18.0-SNAPSHOT" ]
opensearch_version: [ "3.0.0-SNAPSHOT" ]

name: NeuralSearch Rolling-Upgrade BWC Tests
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -8,12 +8,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
### Enhancements
- Set neural-search plugin 3.0.0 baseline JDK version to JDK-21 ([#838](https://github.com/opensearch-project/neural-search/pull/838))
### Bug Fixes
- Fix for nested field missing sub embedding field in text embedding processor ([#913](https://github.com/opensearch-project/neural-search/pull/913))
### Infrastructure
### Documentation
### Maintenance
### Refactoring

## [Unreleased 2.x](https://github.com/opensearch-project/neural-search/compare/2.17...2.x)
## [Unreleased 2.x](https://github.com/opensearch-project/neural-search/compare/2.18...2.x)
### Features
- Pagination in Hybrid query ([]())
### Enhancements
64 changes: 55 additions & 9 deletions DEVELOPER_GUIDE.md
@@ -2,7 +2,7 @@
- [Getting Started](#getting-started)
- [Fork OpenSearch neural-search Repo](#fork-opensearch-neural-search-repo)
- [Install Prerequisites](#install-prerequisites)
- [JDK 11](#jdk-11)
- [JDK 21](#jdk-21)
- [Environment](#Environment)
- [Use an Editor](#use-an-editor)
- [IntelliJ IDEA](#intellij-idea)
@@ -11,10 +11,12 @@
- [Run Single-node Cluster Locally](#run-single-node-cluster-locally)
- [Run Multi-node Cluster Locally](#run-multi-node-cluster-locally)
- [Debugging](#debugging)
- [Major Dependencies](#major-dependencies)
- [Backwards Compatibility Testing](#backwards-compatibility-testing)
- [Adding new tests](#adding-new-tests)
- [Supported configurations](#supported-configurations)
- [Submitting Changes](#submitting-changes)
- [Building On Lucene Version Updates](#building-on-lucene-version-updates)

# Developer Guide

@@ -33,18 +35,18 @@ git clone https://github.com/[your username]/neural-search.git

### Install Prerequisites

#### JDK 11
#### JDK 21

OpenSearch builds using Java 11 at a minimum. This means you must have a JDK 11 installed with the environment variable
`JAVA_HOME` referencing the path to Java home for your JDK 11 installation, e.g. `JAVA_HOME=/usr/lib/jvm/jdk-11`.
OpenSearch builds using Java 21 at a minimum. This means you must have a JDK 21 installed with the environment variable
`JAVA_HOME` referencing the path to Java home for your JDK 21 installation, e.g. `JAVA_HOME=/usr/lib/jvm/jdk-21`.

One easy way to get Java 11 on *nix is to use [sdkman](https://sdkman.io/).
One easy way to get Java 21 on *nix is to use [sdkman](https://sdkman.io/).

```bash
curl -s "https://get.sdkman.io" | bash
source ~/.sdkman/bin/sdkman-init.sh
sdk install java 11.0.2-open
sdk use java 11.0.2-open
sdk install java 21.0.2-open
sdk use java 21.0.2-open
```
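You can verify the active JDK afterwards (standard commands, nothing specific to this repo):

```bash
java -version      # should report a 21.x runtime
echo $JAVA_HOME    # should point at your JDK 21 installation
```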

JDK versions 14 and 17 were tested and are fully supported for local development.
@@ -53,7 +55,7 @@ JDK versions 14 and 17 were tested and are fully supported for local development

### IntelliJ IDEA

When importing into IntelliJ you will need to define an appropriate JDK. The convention is that **this SDK should be named "11"**, and the project import will detect it automatically. For more details on defining an SDK in IntelliJ please refer to [this documentation](https://www.jetbrains.com/help/idea/sdk.html#define-sdk). Note that SDK definitions are global, so you can add the JDK from any project, or after project import. Importing with a missing JDK will still work, IntelliJ will report a problem and will refuse to build until resolved.
When importing into IntelliJ you will need to define an appropriate JDK. The convention is that **this SDK should be named "21"**, and the project import will detect it automatically. For more details on defining an SDK in IntelliJ please refer to [this documentation](https://www.jetbrains.com/help/idea/sdk.html#define-sdk). Note that SDK definitions are global, so you can add the JDK from any project, or after project import. Importing with a missing JDK will still work, IntelliJ will report a problem and will refuse to build until resolved.

You can import the OpenSearch project into IntelliJ IDEA as follows.

@@ -88,12 +90,22 @@ Please follow these formatting guidelines:
OpenSearch neural-search uses a [Gradle](https://docs.gradle.org/6.6.1/userguide/userguide.html) wrapper for its build.
Run `gradlew` on Unix systems.

Build OpenSearch neural-search using `gradlew build`
Build OpenSearch neural-search using `gradlew build`. This command will
also run Integration Tests and Unit Tests.

```
./gradlew build
```
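If you want to build without spinning up the integration test cluster, one option (a sketch, assuming the `integTest` task name used later in this guide) is to exclude that task:

```bash
./gradlew build -x integTest
```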

## Run Unit Tests
If you want to run only the unit tests, you can run the following.

```
./gradlew test
```


## Run OpenSearch neural-search

### Run Single-node Cluster Locally
@@ -227,6 +239,12 @@ Additionally, it is possible to attach one debugger to the cluster JVM and anoth
./gradlew :integTest -Dtest.debug=1 -Dcluster.debug=1
```

#### Major Dependencies
Currently, Neural Search's major dependencies are [ML-Commons](https://github.com/opensearch-project/ml-commons) and [K-NN](https://github.com/opensearch-project/k-NN).
Make sure to check on them when you observe a failure that affects Neural Search.
See [Building On Lucene Version Updates](#building-on-lucene-version-updates) for an example where K-NN caused a build failure.
Also note that it may take time for the developers of those projects to fix a dependency issue.
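One way to see which versions of these dependencies your build resolves is to list the compile classpath; the artifact name patterns in the grep below are assumptions, so adjust them to what your build actually reports:

```bash
./gradlew dependencies --configuration compileClasspath | grep -iE "ml-commons|knn"
```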

## Backwards Compatibility Testing

The purpose of Backwards Compatibility Testing and different types of BWC tests are explained [here](https://github.com/opensearch-project/opensearch-plugins/blob/main/TESTING.md#backwards-compatibility-testing). The BWC tests (i.e. Restart-Upgrade, Mixed-Cluster and Rolling-Upgrade scenarios) should be added with any new feature being added to Neural Search.
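As a rough sketch of running them locally, assuming the `qa` subprojects define a `bwcTestSuite` task and honor the `bwc.version` system property set in `gradle.properties` (both assumptions; check the `qa` build scripts for the exact names):

```bash
# run the BWC test suite against the default BWC version from gradle.properties
./gradlew :qa:bwcTestSuite
# or override the BWC version on the command line
./gradlew :qa:bwcTestSuite -Dbwc.version=2.18.0-SNAPSHOT
```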
@@ -292,3 +310,31 @@ original PR with an appropriate label `backport <backport-branch-name>` is merge
run successfully on the PR. For example, if a PR on main needs to be backported to `2.x` branch, add a label
`backport 2.x` to the PR and make sure the backport workflow runs on the PR along with other checks. Once this PR is
merged to main, the workflow will create a backport PR to the `2.x` branch.

## Building On Lucene Version Updates
A Lucene version update can affect your workflow, causing errors like
`java.lang.NoClassDefFoundError: org/apache/lucene/codecs/lucene99/Lucene99Codec` or
`Provider org.opensearch.knn.index.codec.KNN910Codec.KNN910Codec could not be instantiated`. In this case
there may be an issue with the [K-NN](https://github.com/opensearch-project/k-NN) dependency,
which can prevent `./gradlew run` or `./gradlew build` from completing.

See this [K-NN PR](https://github.com/opensearch-project/k-NN/pull/2195) for an example of this happening, or this [Neural Search PR](https://github.com/opensearch-project/neural-search/pull/913#issuecomment-2400189329) where a developer works through the same build issue.

**Follow these steps to remedy the `./gradlew run` issue (a consolidated sketch follows the list).**
1. From your cloned neural search repo root directory `rm -rf build .gradle`
2. Clear the following directories from your gradle folder located in your root directory
1. `cd ~/.gradle`
2. `rm -rf caches workers wrapper daemon`
3. `cd -` to switch back to the previous directory (i.e. the neural-search repo root directory)
3. Finally run `./gradlew run`
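A consolidated sketch of the steps above:

```bash
# from the neural-search repo root
rm -rf build .gradle
# clear cached Gradle state in your home directory
rm -rf ~/.gradle/caches ~/.gradle/workers ~/.gradle/wrapper ~/.gradle/daemon
# retry the cluster run
./gradlew run
```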

**Follow these steps to remedy the `./gradlew build` issue (a consolidated sketch follows the list).**

**PREREQ:** Make sure you have the OpenSearch repo cloned locally.

1. From your cloned neural search repo root directory `rm -rf build .gradle`
2. Delete the .gradle folder and .m2 folder. `rm -rf ~/.gradle ~/.m2`
3. Head over to your OpenSearch cloned repo root directory
1. `./gradlew publishToMavenLocal`
4. Finally run `./gradlew build` from the neural search repo
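A consolidated sketch of the steps above (the OpenSearch clone location is an assumption; substitute your own paths):

```bash
# from the neural-search repo root
rm -rf build .gradle
# clear cached Gradle and local Maven state
rm -rf ~/.gradle ~/.m2
# publish OpenSearch artifacts to the local Maven repository from your OpenSearch clone
cd ../OpenSearch && ./gradlew publishToMavenLocal
# back in the neural-search repo, rebuild
cd ../neural-search && ./gradlew build
```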
1 change: 1 addition & 0 deletions MAINTAINERS.md
@@ -21,6 +21,7 @@ This document contains a list of maintainers in this repo. See [opensearch-proje
| Vijayan Balasubramanian | [VijayanB](https://github.com/VijayanB) | Amazon |
| Varun Jain | [vibrantvarun](https://github.com/vibrantvarun) | Amazon |
| Zhichao Geng | [zhichao-aws](https://github.com/zhichao-aws) | Amazon |
| Yuye Zhu | [yuye-aws](https://github.com/yuye-aws) | Amazon |

## Emeritus

4 changes: 2 additions & 2 deletions gradle.properties
@@ -7,8 +7,8 @@
# https://github.com/opensearch-project/OpenSearch/blob/main/libs/core/src/main/java/org/opensearch/Version.java .
# Wired compatibility of OpenSearch works like 3.x version is compatible with 2.(latest-major) version.
# Therefore, to run rolling-upgrade BWC Test on local machine the BWC version here should be set 2.(latest-major).
systemProp.bwc.version=2.17.0-SNAPSHOT
systemProp.bwc.bundle.version=2.17.0
systemProp.bwc.version=2.18.0-SNAPSHOT
systemProp.bwc.bundle.version=2.18.0

# For fixing Spotless check with Java 17
org.gradle.jvmargs=--add-exports jdk.compiler/com.sun.tools.javac.api=ALL-UNNAMED \
@@ -9,6 +9,8 @@
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Objects;

import org.opensearch.index.query.MatchQueryBuilder;
import static org.opensearch.neuralsearch.util.TestUtils.NODES_BWC_CLUSTER;
import static org.opensearch.neuralsearch.util.TestUtils.PARAM_NAME_WEIGHTS;
@@ -17,6 +19,8 @@
import static org.opensearch.neuralsearch.util.TestUtils.DEFAULT_COMBINATION_METHOD;
import static org.opensearch.neuralsearch.util.TestUtils.getModelId;

import org.opensearch.index.query.QueryBuilder;
import org.opensearch.index.query.QueryBuilders;
import org.opensearch.knn.index.query.rescore.RescoreContext;
import org.opensearch.neuralsearch.query.HybridQueryBuilder;
import org.opensearch.neuralsearch.query.NeuralQueryBuilder;
@@ -31,6 +35,8 @@ public class HybridSearchIT extends AbstractRollingUpgradeTestCase {
private static final String TEXT_UPGRADED = "Hi earth";
private static final String QUERY = "Hi world";
private static final int NUM_DOCS_PER_ROUND = 1;
private static final String VECTOR_EMBEDDING_FIELD = "passage_embedding";
protected static final String RESCORE_QUERY = "hi";
private static String modelId = "";

// Test rolling-upgrade normalization processor when index with multiple shards
@@ -62,12 +68,13 @@ public void testNormalizationProcessor_whenIndexWithMultipleShards_E2EFlow() thr
if (isFirstMixedRound()) {
totalDocsCountMixed = NUM_DOCS_PER_ROUND;
HybridQueryBuilder hybridQueryBuilder = getQueryBuilder(modelId, null, null);
validateTestIndexOnUpgrade(totalDocsCountMixed, modelId, hybridQueryBuilder);
QueryBuilder rescorer = QueryBuilders.matchQuery(TEST_FIELD, RESCORE_QUERY).boost(0.3f);
validateTestIndexOnUpgrade(totalDocsCountMixed, modelId, hybridQueryBuilder, rescorer);
addDocument(getIndexNameForTest(), "1", TEST_FIELD, TEXT_MIXED, null, null);
} else {
totalDocsCountMixed = 2 * NUM_DOCS_PER_ROUND;
HybridQueryBuilder hybridQueryBuilder = getQueryBuilder(modelId, null, null);
validateTestIndexOnUpgrade(totalDocsCountMixed, modelId, hybridQueryBuilder);
validateTestIndexOnUpgrade(totalDocsCountMixed, modelId, hybridQueryBuilder, null);
}
break;
case UPGRADED:
@@ -77,9 +84,10 @@ public void testNormalizationProcessor_whenIndexWithMultipleShards_E2EFlow() thr
loadModel(modelId);
addDocument(getIndexNameForTest(), "2", TEST_FIELD, TEXT_UPGRADED, null, null);
HybridQueryBuilder hybridQueryBuilder = getQueryBuilder(modelId, null, null);
validateTestIndexOnUpgrade(totalDocsCountUpgraded, modelId, hybridQueryBuilder);
QueryBuilder rescorer = QueryBuilders.matchQuery(TEST_FIELD, RESCORE_QUERY).boost(0.3f);
validateTestIndexOnUpgrade(totalDocsCountUpgraded, modelId, hybridQueryBuilder, rescorer);
hybridQueryBuilder = getQueryBuilder(modelId, Map.of("ef_search", 100), RescoreContext.getDefault());
validateTestIndexOnUpgrade(totalDocsCountUpgraded, modelId, hybridQueryBuilder);
validateTestIndexOnUpgrade(totalDocsCountUpgraded, modelId, hybridQueryBuilder, rescorer);
} finally {
wipeOfTestResources(getIndexNameForTest(), PIPELINE_NAME, modelId, SEARCH_PIPELINE_NAME);
}
@@ -89,15 +97,19 @@ public void testNormalizationProcessor_whenIndexWithMultipleShards_E2EFlow() thr
}
}

private void validateTestIndexOnUpgrade(final int numberOfDocs, final String modelId, HybridQueryBuilder hybridQueryBuilder)
throws Exception {
private void validateTestIndexOnUpgrade(
final int numberOfDocs,
final String modelId,
HybridQueryBuilder hybridQueryBuilder,
QueryBuilder rescorer
) throws Exception {
int docCount = getDocCount(getIndexNameForTest());
assertEquals(numberOfDocs, docCount);
loadModel(modelId);
Map<String, Object> searchResponseAsMap = search(
getIndexNameForTest(),
hybridQueryBuilder,
null,
rescorer,
1,
Map.of("search_pipeline", SEARCH_PIPELINE_NAME)
);
@@ -113,18 +125,18 @@ private void validateTestIndexOnUpgrade(final int numberOfDocs, final String mod
private HybridQueryBuilder getQueryBuilder(
final String modelId,
final Map<String, ?> methodParameters,
final RescoreContext rescoreContext
final RescoreContext rescoreContextForNeuralQuery
) {
NeuralQueryBuilder neuralQueryBuilder = new NeuralQueryBuilder();
neuralQueryBuilder.fieldName("passage_embedding");
neuralQueryBuilder.fieldName(VECTOR_EMBEDDING_FIELD);
neuralQueryBuilder.modelId(modelId);
neuralQueryBuilder.queryText(QUERY);
neuralQueryBuilder.k(5);
if (methodParameters != null) {
neuralQueryBuilder.methodParameters(methodParameters);
}
if (rescoreContext != null) {
neuralQueryBuilder.rescoreContext(rescoreContext);
if (Objects.nonNull(rescoreContextForNeuralQuery)) {
neuralQueryBuilder.rescoreContext(rescoreContextForNeuralQuery);
}

MatchQueryBuilder matchQueryBuilder = new MatchQueryBuilder("text", QUERY);
13 changes: 13 additions & 0 deletions release-notes/opensearch-neural-search.release-notes-2.18.0.0.md
@@ -0,0 +1,13 @@

## Version 2.18.0.0 Release Notes

Compatible with OpenSearch 2.18.0

### Features
- Introduces ByFieldRerankProcessor for second level reranking on documents ([#932](https://github.com/opensearch-project/neural-search/pull/932))
### Bug Fixes
- Fixed incorrect document order for nested aggregations in hybrid query ([#956](https://github.com/opensearch-project/neural-search/pull/956))
### Enhancements
- Implement `ignore_missing` field in text chunking processors ([#907](https://github.com/opensearch-project/neural-search/pull/907))
- Added rescorer in hybrid query ([#917](https://github.com/opensearch-project/neural-search/pull/917))

@@ -10,6 +10,7 @@
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
@@ -285,7 +286,7 @@ private void createInferenceListForMapTypeInput(Object sourceValue, List<String>
if (sourceValue instanceof Map) {
((Map<String, Object>) sourceValue).forEach((k, v) -> createInferenceListForMapTypeInput(v, texts));
} else if (sourceValue instanceof List) {
texts.addAll(((List<String>) sourceValue));
((List<String>) sourceValue).stream().filter(Objects::nonNull).forEach(texts::add);
} else {
if (sourceValue == null) return;
texts.add(sourceValue.toString());
@@ -419,8 +420,12 @@ private void putNLPResultToSourceMapForMapType(
for (Map.Entry<String, Object> inputNestedMapEntry : ((Map<String, Object>) sourceValue).entrySet()) {
if (sourceAndMetadataMap.get(processorKey) instanceof List) {
// build nlp output for list of nested objects
Iterator<Object> inputNestedMapValueIt = ((List<Object>) inputNestedMapEntry.getValue()).iterator();
for (Map<String, Object> nestedElement : (List<Map<String, Object>>) sourceAndMetadataMap.get(processorKey)) {
nestedElement.put(inputNestedMapEntry.getKey(), results.get(indexWrapper.index++));
// Only fill in when value is not null
if (inputNestedMapValueIt.hasNext() && inputNestedMapValueIt.next() != null) {
nestedElement.put(inputNestedMapEntry.getKey(), results.get(indexWrapper.index++));
}
}
} else {
Pair<String, Object> processedNestedKey = processNestedKey(inputNestedMapEntry);