apache · gf2121 · Oct 10, 2023 · Oct 7, 2023 · Oct 7, 2023 · Oct 7, 2023
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java
@@ -16,12 +16,15 @@
  */
 package org.apache.lucene.codecs.lucene90.blocktree;
 
+import static org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader.VERSION_MSB_VLONG_OUTPUT;
+
 import java.io.IOException;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.automaton.CompiledAutomaton;
@@ -82,7 +85,7 @@ public final class FieldReader extends Terms {
     // + rootCode + " divisor=" + indexDivisor);
     // }
     rootBlockFP =
-        (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong()
+        readLongOutput(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length))
             >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
     // Initialize FST always off-heap.
     final IndexInput clone = indexIn.clone();
@@ -99,6 +102,26 @@ public final class FieldReader extends Terms {
     */
   }
 
+  long readLongOutput(DataInput in) throws IOException {
+    if (parent.version >= VERSION_MSB_VLONG_OUTPUT) {
+      return readMSBVLong(in);
+    } else {
+      return in.readVLong();
+    }
+  }
+
+  private static long readMSBVLong(DataInput in) throws IOException {
+    long l = 0L;
+    while (true) {
+      byte b = in.readByte();
+      l = (l << 7) | (b & 0x7FL);
+      if ((b & 0x80) == 0) {
+        break;
+      }
+    }
+    return l;
+  }
+
   @Override
   public BytesRef getMin() throws IOException {
     if (minTerm == null) {

diff --git a/...ne/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java b/...ne/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java
@@ -146,7 +146,7 @@ void load(BytesRef frameIndexData) throws IOException {
       floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
       // Skip first long -- has redundant fp, hasTerms
       // flag, isFloor flag
-      final long code = floorDataReader.readVLong();
+      final long code = ite.fr.readLongOutput(floorDataReader);
       if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
         // Floor frame
         numFollowFloorBlocks = floorDataReader.readVInt();

diff --git a/...re/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java b/...re/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java
@@ -81,8 +81,11 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
   /** Initial terms format. */
   public static final int VERSION_START = 0;
 
+  /** Version that uses MSB VLong encoded output */
+  public static final int VERSION_MSB_VLONG_OUTPUT = 1;
+
   /** Current terms format. */
-  public static final int VERSION_CURRENT = VERSION_START;
+  public static final int VERSION_CURRENT = VERSION_MSB_VLONG_OUTPUT;
 
   /** Extension of terms index file */
   static final String TERMS_INDEX_EXTENSION = "tip";

diff --git a/...re/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java b/...re/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java
@@ -460,6 +460,19 @@ public String toString() {
       return "BLOCK: prefix=" + brToString(prefix);
     }
 
+    private static void writeMSBVLong(long i, DataOutput scratchBytes) throws IOException {
+      assert i >= 0;
+      // Keep zero bits on most significant byte to have more chance to get prefix bytes shared.
+      // e.g. we expect 0x7FFF stored as [0x81, 0xFF, 0x7F] but not [0xFF, 0xFF, 0x40]
+      int LSBVLongBytes = (Long.SIZE - Long.numberOfLeadingZeros(i) - 1) / 7 + 1;
+      i <<= Long.SIZE - LSBVLongBytes * 7;
+      for (int j = 1; j < LSBVLongBytes; j++) {
+        scratchBytes.writeByte((byte) (((i >>> 57) & 0x7FL) | 0x80));
+        i = i << 7;
+      }
+      scratchBytes.writeByte((byte) (((i >>> 57) & 0x7FL)));
+    }
+
     public void compileIndex(
         List<PendingBlock> blocks,
         ByteBuffersDataOutput scratchBytes,
@@ -472,10 +485,8 @@ public void compileIndex(
 
       assert scratchBytes.size() == 0;
 
-      // TODO: try writing the leading vLong in MSB order
-      // (opposite of what Lucene does today), for better
-      // outputs sharing in the FST
-      scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor));
+      // write the leading vLong in MSB order for better outputs sharing in the FST
+      writeMSBVLong(encodeOutput(fp, hasTerms, isFloor), scratchBytes);
       if (isFloor) {
         scratchBytes.writeVInt(blocks.size() - 1);
         for (int i = 1; i < blocks.size(); i++) {

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java
@@ -236,7 +236,7 @@ private FST.Arc<BytesRef> getArc(int ord) {
   SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length)
       throws IOException {
     scratchReader.reset(frameData.bytes, frameData.offset, frameData.length);
-    final long code = scratchReader.readVLong();
+    final long code = fr.readLongOutput(scratchReader);
     final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
     final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord);
     f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0;
@@ -980,7 +980,7 @@ private void printSeekState(PrintStream out) throws IOException {
           } else if (isSeekFrame && !f.isFloor) {
             final ByteArrayDataInput reader =
                 new ByteArrayDataInput(output.bytes, output.offset, output.length);
-            final long codeOrig = reader.readVLong();
+            final long codeOrig = fr.readLongOutput(reader);
             final long code =
                 (f.fp << Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS)
                     | (f.hasTerms ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0)