Enable numeric sort optimization support for all numeric types (#6424)

* Adding numeric optimization support for all numeric types Signed-off-by: gashutos <gashutos@amazon.com> * modifying CHANGELOG.md Signed-off-by: gashutos <gashutos@amazon.com> * Handling multi-cluster scenario where SortField serialization was failing Signed-off-by: gashutos <gashutos@amazon.com> * Fixing javadoc errors Signed-off-by: gashutos <gashutos@amazon.com> * Fixing nested sort integ tests Signed-off-by: gashutos <gashutos@amazon.com> * Stremlining behaviour of custom comparator tests too Signed-off-by: gashutos <gashutos@amazon.com> * Adding more integ tests for IntValuesComparatorSource & fixing few ITs Signed-off-by: gashutos <gashutos@amazon.com> * Fixing few more integ tests Signed-off-by: gashutos <gashutos@amazon.com> * Streamlining applySortWidening method with CreateSort and avoid modifying cteated objects of sort Signed-off-by: gashutos <gashutos@amazon.com> * Correcting licence header Signed-off-by: gashutos <gashutos@amazon.com> --------- Signed-off-by: gashutos <gashutos@amazon.com> Co-authored-by: Daniel (dB.) Doubrovkine <dblock@amazon.com>
opensearch-project · Mar 27, 2023 · e8425fc · e8425fc
1 parent 16797d6
commit e8425fc
Show file tree

Hide file tree

Showing 14 changed files with 703 additions and 172 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -87,6 +87,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - [Segment Replication] Apply backpressure when replicas fall behind ([#6563](https://github.com/opensearch-project/OpenSearch/pull/6563))
 - [Remote Store] Integrate remote segment store in peer recovery flow ([#6664](https://github.com/opensearch-project/OpenSearch/pull/6664))
 - [Segment Replication] Add new cluster setting to set replication strategy by default for all indices in cluster. ([#6791](https://github.com/opensearch-project/OpenSearch/pull/6791))
+- Enable sort optimization for all NumericTypes ([#6464](https://github.com/opensearch-project/OpenSearch/pull/6464)
 
 ### Dependencies
 - Bump `org.apache.logging.log4j:log4j-core` from 2.18.0 to 2.20.0 ([#6490](https://github.com/opensearch-project/OpenSearch/pull/6490))

diff --git a/server/src/internalClusterTest/java/org/opensearch/index/IndexSortIT.java b/server/src/internalClusterTest/java/org/opensearch/index/IndexSortIT.java
@@ -81,8 +81,8 @@ private static XContentBuilder createTestMapping() {
     public void testIndexSort() {
         SortField dateSort = new SortedNumericSortField("date", SortField.Type.LONG, false);
         dateSort.setMissingValue(Long.MAX_VALUE);
-        SortField numericSort = new SortedNumericSortField("numeric_dv", SortField.Type.LONG, false);
-        numericSort.setMissingValue(Long.MAX_VALUE);
+        SortField numericSort = new SortedNumericSortField("numeric_dv", SortField.Type.INT, false);
+        numericSort.setMissingValue(Integer.MAX_VALUE);
         SortField keywordSort = new SortedSetSortField("keyword_dv", false);
         keywordSort.setMissingValue(SortField.STRING_LAST);
         Sort indexSort = new Sort(dateSort, numericSort, keywordSort);

diff --git a/server/src/internalClusterTest/java/org/opensearch/search/searchafter/SearchAfterIT.java b/server/src/internalClusterTest/java/org/opensearch/search/searchafter/SearchAfterIT.java
@@ -382,24 +382,22 @@ private void createIndexMappingsFromObjectType(String indexName, List<Object> ty
         ensureGreen();
     }
 
-    // Convert Integer, Short, Byte and Boolean to Long in order to match the conversion done
+    // Convert Integer, Short, Byte and Boolean to Int in order to match the conversion done
     // by the internal hits when populating the sort values.
     private List<Object> convertSortValues(List<Object> sortValues) {
         List<Object> converted = new ArrayList<>();
         for (int i = 0; i < sortValues.size(); i++) {
             Object from = sortValues.get(i);
-            if (from instanceof Integer) {
-                converted.add(((Integer) from).longValue());
-            } else if (from instanceof Short) {
-                converted.add(((Short) from).longValue());
+            if (from instanceof Short) {
+                converted.add(((Short) from).intValue());
             } else if (from instanceof Byte) {
-                converted.add(((Byte) from).longValue());
+                converted.add(((Byte) from).intValue());
             } else if (from instanceof Boolean) {
                 boolean b = (boolean) from;
                 if (b) {
-                    converted.add(1L);
+                    converted.add(1);
                 } else {
-                    converted.add(0L);
+                    converted.add(0);
                 }
             } else {
                 converted.add(from);

diff --git a/server/src/main/java/org/opensearch/action/search/SearchPhaseController.java b/server/src/main/java/org/opensearch/action/search/SearchPhaseController.java
@@ -41,6 +41,7 @@
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.SortedNumericSortField;
 import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.TopFieldDocs;
@@ -68,6 +69,7 @@
 import org.opensearch.search.profile.ProfileShardResult;
 import org.opensearch.search.profile.SearchProfileShardResults;
 import org.opensearch.search.query.QuerySearchResult;
+import org.opensearch.search.sort.SortedWiderNumericSortField;
 import org.opensearch.search.suggest.Suggest;
 import org.opensearch.search.suggest.Suggest.Suggestion;
 import org.opensearch.search.suggest.completion.CompletionSuggestion;
@@ -235,14 +237,12 @@ static TopDocs mergeTopDocs(Collection<TopDocs> results, int topN, int from) {
         if (numShards == 1 && from == 0) { // only one shard and no pagination we can just return the topDocs as we got them.
             return topDocs;
         } else if (topDocs instanceof CollapseTopFieldDocs) {
-            CollapseTopFieldDocs firstTopDocs = (CollapseTopFieldDocs) topDocs;
-            final Sort sort = new Sort(firstTopDocs.fields);
             final CollapseTopFieldDocs[] shardTopDocs = results.toArray(new CollapseTopFieldDocs[numShards]);
+            final Sort sort = createSort(shardTopDocs);
             mergedTopDocs = CollapseTopFieldDocs.merge(sort, from, topN, shardTopDocs, false);
         } else if (topDocs instanceof TopFieldDocs) {
-            TopFieldDocs firstTopDocs = (TopFieldDocs) topDocs;
-            final Sort sort = new Sort(firstTopDocs.fields);
             final TopFieldDocs[] shardTopDocs = results.toArray(new TopFieldDocs[numShards]);
+            final Sort sort = createSort(shardTopDocs);
             mergedTopDocs = TopDocs.merge(sort, from, topN, shardTopDocs);
         } else {
             final TopDocs[] shardTopDocs = results.toArray(new TopDocs[numShards]);
@@ -600,6 +600,49 @@ private static void validateMergeSortValueFormats(Collection<? extends SearchPha
         }
     }
 
+    /**
+     * Creates Sort object from topFieldsDocs fields.
+     * It is necessary to widen the SortField.Type to maximum byte size for merging sorted docs.
+     * Different indices might have different types. This will avoid user to do re-index of data
+     * in case of mapping field change for newly indexed data.
+     * This will support Int to Long and Float to Double.
+     * Earlier widening of type was taken care in IndexNumericFieldData, but since we now want to
+     * support sort optimization, we removed type widening there and taking care here during merging.
+     * More details here https://github.com/opensearch-project/OpenSearch/issues/6326
+     */
+    private static Sort createSort(TopFieldDocs[] topFieldDocs) {
+        final SortField[] firstTopDocFields = topFieldDocs[0].fields;
+        final SortField[] newFields = new SortField[firstTopDocFields.length];
+
+        for (int i = 0; i < firstTopDocFields.length; i++) {
+            if (firstTopDocFields[i] instanceof SortedNumericSortField && isSortWideningRequired(topFieldDocs, i)) {
+                final SortedNumericSortField delegate = (SortedNumericSortField) firstTopDocFields[i];
+                newFields[i] = new SortedWiderNumericSortField(
+                    delegate.getField(),
+                    delegate.getNumericType(),
+                    delegate.getReverse(),
+                    delegate.getSelector()
+                );
+            } else {
+                newFields[i] = firstTopDocFields[i];
+            }
+        }
+        return new Sort(newFields);
+    }
+
+    /**
+     * It will compare respective SortField between shards to see if any shard results have different
+     * field mapping type, accordingly it will decide to widen the sort fields.
+     */
+    private static boolean isSortWideningRequired(TopFieldDocs[] topFieldDocs, int sortFieldindex) {
+        for (int i = 0; i < topFieldDocs.length - 1; i++) {
+            if (topFieldDocs[i].fields[sortFieldindex] != topFieldDocs[i + 1].fields[sortFieldindex]) {
+                return true;
+            }
+        }
+        return false;
+    }
+
     /*
      * Returns the size of the requested top documents (from + size)
      */

diff --git a/server/src/main/java/org/opensearch/common/lucene/Lucene.java b/server/src/main/java/org/opensearch/common/lucene/Lucene.java
@@ -93,6 +93,7 @@
 import org.opensearch.index.analysis.AnalyzerScope;
 import org.opensearch.index.analysis.NamedAnalyzer;
 import org.opensearch.index.fielddata.IndexFieldData;
+import org.opensearch.search.sort.SortedWiderNumericSortField;
 
 import java.io.IOException;
 import java.math.BigInteger;
@@ -599,8 +600,8 @@ public static void writeSortField(StreamOutput out, SortField sortField) throws
             SortField newSortField = new SortField(sortField.getField(), SortField.Type.STRING, sortField.getReverse());
             newSortField.setMissingValue(sortField.getMissingValue());
             sortField = newSortField;
-        } else if (sortField.getClass() == SortedNumericSortField.class) {
-            // for multi-valued sort field, we replace the SortedSetSortField with a simple SortField.
+        } else if (sortField.getClass() == SortedNumericSortField.class || sortField.getClass() == SortedWiderNumericSortField.class) {
+            // for multi-valued sort field, we replace the SortedNumericSortField/SortedWiderNumericSortField with a simple SortField.
             // It works because the sort field is only used to merge results from different shards.
             SortField newSortField = new SortField(
                 sortField.getField(),

diff --git a/server/src/main/java/org/opensearch/index/fielddata/IndexNumericFieldData.java b/server/src/main/java/org/opensearch/index/fielddata/IndexNumericFieldData.java
@@ -42,6 +42,7 @@
 import org.opensearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
 import org.opensearch.index.fielddata.fieldcomparator.DoubleValuesComparatorSource;
 import org.opensearch.index.fielddata.fieldcomparator.FloatValuesComparatorSource;
+import org.opensearch.index.fielddata.fieldcomparator.IntValuesComparatorSource;
 import org.opensearch.index.fielddata.fieldcomparator.LongValuesComparatorSource;
 import org.opensearch.search.DocValueFormat;
 import org.opensearch.search.MultiValueMode;
@@ -65,76 +66,16 @@ public abstract class IndexNumericFieldData implements IndexFieldData<LeafNumeri
      * @opensearch.internal
      */
     public enum NumericType {
-        BOOLEAN(false, SortField.Type.LONG, CoreValuesSourceType.BOOLEAN) {
-            @Deprecated
-            @Override
-            protected PointSortOptimization applyPointSortOptimization() {
-                return PointSortOptimization.DISABLED;
-            }
-        },
-        BYTE(false, SortField.Type.LONG, CoreValuesSourceType.NUMERIC) {
-            @Deprecated
-            @Override
-            protected PointSortOptimization applyPointSortOptimization() {
-                return PointSortOptimization.DISABLED;
-            }
-        },
-        SHORT(false, SortField.Type.LONG, CoreValuesSourceType.NUMERIC) {
-            @Deprecated
-            @Override
-            protected PointSortOptimization applyPointSortOptimization() {
-                return PointSortOptimization.DISABLED;
-            }
-        },
-        INT(false, SortField.Type.LONG, CoreValuesSourceType.NUMERIC) {
-            @Deprecated
-            @Override
-            protected PointSortOptimization applyPointSortOptimization() {
-                return PointSortOptimization.DISABLED;
-            }
-        },
-        LONG(false, SortField.Type.LONG, CoreValuesSourceType.NUMERIC) {
-            @Deprecated
-            @Override
-            protected PointSortOptimization applyPointSortOptimization() {
-                return PointSortOptimization.ENABLED;
-            }
-        },
-        DATE(false, SortField.Type.LONG, CoreValuesSourceType.DATE) {
-            @Deprecated
-            @Override
-            protected PointSortOptimization applyPointSortOptimization() {
-                return PointSortOptimization.ENABLED;
-            }
-        },
-        DATE_NANOSECONDS(false, SortField.Type.LONG, CoreValuesSourceType.DATE) {
-            @Deprecated
-            @Override
-            public PointSortOptimization applyPointSortOptimization() {
-                return PointSortOptimization.ENABLED;
-            }
-        },
-        HALF_FLOAT(true, SortField.Type.LONG, CoreValuesSourceType.NUMERIC) {
-            @Deprecated
-            @Override
-            protected PointSortOptimization applyPointSortOptimization() {
-                return PointSortOptimization.DISABLED;
-            }
-        },
-        FLOAT(true, SortField.Type.FLOAT, CoreValuesSourceType.NUMERIC) {
-            @Deprecated
-            @Override
-            protected PointSortOptimization applyPointSortOptimization() {
-                return PointSortOptimization.DISABLED;
-            }
-        },
-        DOUBLE(true, SortField.Type.DOUBLE, CoreValuesSourceType.NUMERIC) {
-            @Deprecated
-            @Override
-            protected PointSortOptimization applyPointSortOptimization() {
-                return PointSortOptimization.ENABLED;
-            }
-        };
+        BOOLEAN(false, SortField.Type.INT, CoreValuesSourceType.BOOLEAN),
+        BYTE(false, SortField.Type.INT, CoreValuesSourceType.NUMERIC),
+        SHORT(false, SortField.Type.INT, CoreValuesSourceType.NUMERIC),
+        INT(false, SortField.Type.INT, CoreValuesSourceType.NUMERIC),
+        LONG(false, SortField.Type.LONG, CoreValuesSourceType.NUMERIC),
+        DATE(false, SortField.Type.LONG, CoreValuesSourceType.DATE),
+        DATE_NANOSECONDS(false, SortField.Type.LONG, CoreValuesSourceType.DATE),
+        HALF_FLOAT(true, SortField.Type.LONG, CoreValuesSourceType.NUMERIC),
+        FLOAT(true, SortField.Type.FLOAT, CoreValuesSourceType.NUMERIC),
+        DOUBLE(true, SortField.Type.DOUBLE, CoreValuesSourceType.NUMERIC);
 
         private final boolean floatingPoint;
         private final ValuesSourceType valuesSourceType;
@@ -153,24 +94,6 @@ public final boolean isFloatingPoint() {
         public final ValuesSourceType getValuesSourceType() {
             return valuesSourceType;
         }
-
-        @Deprecated
-        protected abstract PointSortOptimization applyPointSortOptimization();
-    }
-
-    /**
-     * Controls whether to apply sort optimization to skip non-competitive docs
-     * based on the BKD index.
-     *
-     * @deprecated this control will be removed in a future version of OpenSearch
-     *
-     * @opensearch.internal
-     * @opensearch.experimental
-     */
-    @Deprecated
-    private enum PointSortOptimization {
-        ENABLED,
-        DISABLED
     }
 
     /**
@@ -211,21 +134,6 @@ public final SortField sortField(
             : SortedNumericSelector.Type.MIN;
         SortField sortField = new SortedNumericSortField(getFieldName(), getNumericType().sortFieldType, reverse, selectorType);
         sortField.setMissingValue(source.missingObject(missingValue, reverse));
-
-        // LUCENE-9280 added the ability for collectors to skip non-competitive
-        // documents when top docs are sorted by other fields different from the _score.
-        // However, from Lucene 9 onwards, numeric sort optimisation requires the byte size
-        // for points (BKD index) and doc values (columnar) and SortField.Type to be matched.
-        // NumericType violates this requirement
-        // (see: https://github.com/opensearch-project/OpenSearch/issues/2063#issuecomment-1069358826 test failure)
-        // because it uses the largest byte size (LONG) for the SortField of most types. The section below disables
-        // the BKD based sort optimization for numeric types whose encoded BYTE size does not match the comparator (LONG)/
-        // So as of now, we can only enable for DATE, DATE_NANOSECONDS, LONG, DOUBLE.
-        // BOOLEAN, BYTE, SHORT, INT, HALF_FLOAT, FLOAT (use long for doc values, but fewer for BKD Points)
-        // todo : Enable other SortField.Type as well, that will require wider change
-        if (getNumericType().applyPointSortOptimization() == PointSortOptimization.DISABLED) {
-            sortField.setOptimizeSortWithPoints(false);
-        }
         return sortField;
     }
 
@@ -298,9 +206,11 @@ private XFieldComparatorSource comparatorSource(
                 return dateComparatorSource(missingValue, sortMode, nested);
             case DATE_NANOSECONDS:
                 return dateNanosComparatorSource(missingValue, sortMode, nested);
+            case LONG:
+                return new LongValuesComparatorSource(this, missingValue, sortMode, nested);
             default:
                 assert !targetNumericType.isFloatingPoint();
-                return new LongValuesComparatorSource(this, missingValue, sortMode, nested);
+                return new IntValuesComparatorSource(this, missingValue, sortMode, nested);
         }
     }