diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 736b7f2f4ef1..0552300acaed 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -178,6 +178,9 @@ Optimizations * GITHUB#12623: Use a MergeSorter taking advantage of extra storage for StableMSBRadixSorter. (Guo Feng) +* GITHUB#12623: Write MSB VLong for better outputs sharing in block tree index, decreasing ~14% size + of .tip file. (Guo Feng) + Changes in runtime behavior --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java index 7aba78112e8b..39caaeb622dc 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java @@ -16,12 +16,15 @@ */ package org.apache.lucene.codecs.lucene90.blocktree; +import static org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader.VERSION_MSB_VLONG_OUTPUT; + import java.io.IOException; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.DataInput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.CompiledAutomaton; @@ -82,7 +85,7 @@ public final class FieldReader extends Terms { // + rootCode + " divisor=" + indexDivisor); // } rootBlockFP = - (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong() + readVLongOutput(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)) >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS; // Initialize FST always off-heap. 
final IndexInput clone = indexIn.clone(); @@ -99,6 +102,32 @@ public final class FieldReader extends Terms { */ } + long readVLongOutput(DataInput in) throws IOException { + if (parent.version >= VERSION_MSB_VLONG_OUTPUT) { + return readMSBVLong(in); + } else { + return in.readVLong(); + } + } + + /** + * Decodes a variable length byte[] in MSB order back to long, as written by {@link + * Lucene90BlockTreeTermsWriter#writeMSBVLong}. + * + *
Package private for testing. + */ + static long readMSBVLong(DataInput in) throws IOException { + long l = 0L; + while (true) { + byte b = in.readByte(); + l = (l << 7) | (b & 0x7FL); + if ((b & 0x80) == 0) { + break; + } + } + return l; + } + @Override public BytesRef getMin() throws IOException { if (minTerm == null) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java index eb60d7f35246..d9ca7a9bbd81 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java @@ -146,7 +146,7 @@ void load(BytesRef frameIndexData) throws IOException { floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length); // Skip first long -- has redundant fp, hasTerms // flag, isFloor flag - final long code = floorDataReader.readVLong(); + final long code = ite.fr.readVLongOutput(floorDataReader); if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) { // Floor frame numFollowFloorBlocks = floorDataReader.readVInt(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java index 4054e7a719cd..1b385f8fa09f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java @@ -81,8 +81,13 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer { /** Initial terms format. */ public static final int VERSION_START = 0; + /** + * Version that encode output as MSB VLong for better outputs sharing in FST, see GITHUB#12620. 
+ */ + public static final int VERSION_MSB_VLONG_OUTPUT = 1; + /** Current terms format. */ - public static final int VERSION_CURRENT = VERSION_START; + public static final int VERSION_CURRENT = VERSION_MSB_VLONG_OUTPUT; /** Extension of terms index file */ static final String TERMS_INDEX_EXTENSION = "tip"; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java index 1b0bd3568c9f..d0114ce0e79d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java @@ -430,6 +430,25 @@ static String brToString(byte[] b) { return brToString(new BytesRef(b)); } + /** + * Encodes long value to variable length byte[], in MSB order. Use {@link + * FieldReader#readMSBVLong} to decode. + * + *
Package private for testing
+ */
+ static void writeMSBVLong(long l, DataOutput scratchBytes) throws IOException {
+ assert l >= 0; // the byte-count computation below is only valid for non-negative values
+ // Left-pad with zero bits in the most significant byte so that encoded values are more likely to
+ // share prefix bytes, e.g. 0x7FFF is stored as [0x81, 0xFF, 0x7F] rather than [0xFF, 0xFF, 0x40].
+ final int bytesNeeded = (Long.SIZE - Long.numberOfLeadingZeros(l) - 1) / 7 + 1; // 7 payload bits per byte; evaluates to 1 when l == 0
+ l <<= Long.SIZE - bytesNeeded * 7; // left-align the (bytesNeeded * 7) payload bits so each group can be taken from the top of l
+ for (int i = 1; i < bytesNeeded; i++) { // every byte except the last one
+ scratchBytes.writeByte((byte) (((l >>> 57) & 0x7FL) | 0x80)); // top 7 bits, with the high bit set to signal that more bytes follow
+ l = l << 7; // move the next 7-bit group to the top
+ }
+ scratchBytes.writeByte((byte) (((l >>> 57) & 0x7FL))); // last byte: high bit left clear, which is where readMSBVLong stops
+ }
+
private static final class PendingBlock extends PendingEntry {
public final BytesRef prefix;
public final long fp;
@@ -472,10 +491,8 @@ public void compileIndex(
assert scratchBytes.size() == 0;
- // TODO: try writing the leading vLong in MSB order
- // (opposite of what Lucene does today), for better
- // outputs sharing in the FST
- scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor));
+ // write the leading vLong in MSB order for better outputs sharing in the FST
+ writeMSBVLong(encodeOutput(fp, hasTerms, isFloor), scratchBytes);
if (isFloor) {
scratchBytes.writeVInt(blocks.size() - 1);
for (int i = 1; i < blocks.size(); i++) {
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java
index 0e865061cf0e..cb5577d8d6c9 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java
@@ -236,7 +236,7 @@ private FST.Arc