diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 736b7f2f4ef1..0552300acaed 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -178,6 +178,9 @@ Optimizations * GITHUB#12623: Use a MergeSorter taking advantage of extra storage for StableMSBRadixSorter. (Guo Feng) +* GITHUB#12623: Write MSB VLong for better outputs sharing in block tree index, decreasing ~14% size + of .tip file. (Guo Feng) + Changes in runtime behavior --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java index 7aba78112e8b..39caaeb622dc 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java @@ -16,12 +16,15 @@ */ package org.apache.lucene.codecs.lucene90.blocktree; +import static org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader.VERSION_MSB_VLONG_OUTPUT; + import java.io.IOException; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.DataInput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.CompiledAutomaton; @@ -82,7 +85,7 @@ public final class FieldReader extends Terms { // + rootCode + " divisor=" + indexDivisor); // } rootBlockFP = - (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong() + readVLongOutput(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)) >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS; // Initialize FST always off-heap. 
final IndexInput clone = indexIn.clone(); @@ -99,6 +102,32 @@ public final class FieldReader extends Terms { */ } + long readVLongOutput(DataInput in) throws IOException { + if (parent.version >= VERSION_MSB_VLONG_OUTPUT) { + return readMSBVLong(in); + } else { + return in.readVLong(); + } + } + + /** + * Decodes a variable length byte[] in MSB order back to long, as written by {@link + * Lucene90BlockTreeTermsWriter#writeMSBVLong}. + * + *
<p>
Package private for testing. + */ + static long readMSBVLong(DataInput in) throws IOException { + long l = 0L; + while (true) { + byte b = in.readByte(); + l = (l << 7) | (b & 0x7FL); + if ((b & 0x80) == 0) { + break; + } + } + return l; + } + @Override public BytesRef getMin() throws IOException { if (minTerm == null) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java index eb60d7f35246..d9ca7a9bbd81 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java @@ -146,7 +146,7 @@ void load(BytesRef frameIndexData) throws IOException { floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length); // Skip first long -- has redundant fp, hasTerms // flag, isFloor flag - final long code = floorDataReader.readVLong(); + final long code = ite.fr.readVLongOutput(floorDataReader); if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) { // Floor frame numFollowFloorBlocks = floorDataReader.readVInt(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java index 4054e7a719cd..1b385f8fa09f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java @@ -81,8 +81,13 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer { /** Initial terms format. */ public static final int VERSION_START = 0; + /** + * Version that encode output as MSB VLong for better outputs sharing in FST, see GITHUB#12620. 
+ */ + public static final int VERSION_MSB_VLONG_OUTPUT = 1; + /** Current terms format. */ - public static final int VERSION_CURRENT = VERSION_START; + public static final int VERSION_CURRENT = VERSION_MSB_VLONG_OUTPUT; /** Extension of terms index file */ static final String TERMS_INDEX_EXTENSION = "tip"; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java index 1b0bd3568c9f..d0114ce0e79d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java @@ -430,6 +430,25 @@ static String brToString(byte[] b) { return brToString(new BytesRef(b)); } + /** + * Encodes long value to variable length byte[], in MSB order. Use {@link + * FieldReader#readMSBVLong} to decode. + * + *
<p>
Package private for testing + */ + static void writeMSBVLong(long l, DataOutput scratchBytes) throws IOException { + assert l >= 0; + // Keep zero bits on most significant byte to have more chance to get prefix bytes shared. + // e.g. we expect 0x7FFF stored as [0x81, 0xFF, 0x7F] but not [0xFF, 0xFF, 0x40] + final int bytesNeeded = (Long.SIZE - Long.numberOfLeadingZeros(l) - 1) / 7 + 1; + l <<= Long.SIZE - bytesNeeded * 7; + for (int i = 1; i < bytesNeeded; i++) { + scratchBytes.writeByte((byte) (((l >>> 57) & 0x7FL) | 0x80)); + l = l << 7; + } + scratchBytes.writeByte((byte) (((l >>> 57) & 0x7FL))); + } + private static final class PendingBlock extends PendingEntry { public final BytesRef prefix; public final long fp; @@ -472,10 +491,8 @@ public void compileIndex( assert scratchBytes.size() == 0; - // TODO: try writing the leading vLong in MSB order - // (opposite of what Lucene does today), for better - // outputs sharing in the FST - scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor)); + // write the leading vLong in MSB order for better outputs sharing in the FST + writeMSBVLong(encodeOutput(fp, hasTerms, isFloor), scratchBytes); if (isFloor) { scratchBytes.writeVInt(blocks.size() - 1); for (int i = 1; i < blocks.size(); i++) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java index 0e865061cf0e..cb5577d8d6c9 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java @@ -236,7 +236,7 @@ private FST.Arc getArc(int ord) { SegmentTermsEnumFrame pushFrame(FST.Arc arc, BytesRef frameData, int length) throws IOException { scratchReader.reset(frameData.bytes, frameData.offset, frameData.length); - final long code = scratchReader.readVLong(); + final long code = 
fr.readVLongOutput(scratchReader); final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS; final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord); f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0; @@ -980,7 +980,7 @@ private void printSeekState(PrintStream out) throws IOException { } else if (isSeekFrame && !f.isFloor) { final ByteArrayDataInput reader = new ByteArrayDataInput(output.bytes, output.offset, output.length); - final long codeOrig = reader.readVLong(); + final long codeOrig = fr.readVLongOutput(reader); final long code = (f.fp << Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) | (f.hasTerms ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java new file mode 100644 index 000000000000..1ebab9262099 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene90.blocktree; + +import java.io.IOException; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.ArrayUtil; + +public class TestMSBVLong extends LuceneTestCase { + + public void testMSBVLong() throws IOException { + assertMSBVLong(Long.MAX_VALUE); + int iter = atLeast(10000); + for (long i = 0; i < iter; i++) { + assertMSBVLong(i); + } + } + + private static void assertMSBVLong(long l) throws IOException { + byte[] bytes = new byte[10]; + ByteArrayDataOutput output = new ByteArrayDataOutput(bytes); + Lucene90BlockTreeTermsWriter.writeMSBVLong(l, output); + ByteArrayDataInput in = + new ByteArrayDataInput(ArrayUtil.copyOfSubArray(bytes, 0, output.getPosition())); + long recovered = FieldReader.readMSBVLong(in); + assertEquals(l + " != " + recovered, l, recovered); + } +}