Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Write MSB VLong for better outputs sharing in block tree index #12631

Merged
merged 18 commits into from
Oct 10, 2023
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,15 @@
*/
package org.apache.lucene.codecs.lucene90.blocktree;

import static org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader.VERSION_MSB_VLONG_OUTPUT;

import java.io.IOException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
Expand Down Expand Up @@ -82,7 +85,7 @@ public final class FieldReader extends Terms {
// + rootCode + " divisor=" + indexDivisor);
// }
rootBlockFP =
(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong()
readLongOutput(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length))
>>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
// Initialize FST always off-heap.
final IndexInput clone = indexIn.clone();
Expand All @@ -99,6 +102,26 @@ public final class FieldReader extends Terms {
*/
}

long readLongOutput(DataInput in) throws IOException {
gf2121 marked this conversation as resolved.
Show resolved Hide resolved
if (parent.version >= VERSION_MSB_VLONG_OUTPUT) {
return readMSBVLong(in);
} else {
return in.readVLong();
}
}

private static long readMSBVLong(DataInput in) throws IOException {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add a private javadoc linking to the write method? E.g.

/** Decodes a variable length byte[] in MSB order back to long, as written by Lucene90BlockTreeTermsWriter.writeMSBVLong. Use FieldReader#readMSBVlong to decode. */

long l = 0L;
while (true) {
byte b = in.readByte();
l = (l << 7) | (b & 0x7FL);
if ((b & 0x80) == 0) {
break;
}
}
return l;
}

@Override
public BytesRef getMin() throws IOException {
if (minTerm == null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ void load(BytesRef frameIndexData) throws IOException {
floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
// Skip first long -- has redundant fp, hasTerms
// flag, isFloor flag
final long code = floorDataReader.readVLong();
final long code = ite.fr.readLongOutput(floorDataReader);
if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
// Floor frame
numFollowFloorBlocks = floorDataReader.readVInt();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,11 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
/** Initial terms format. */
public static final int VERSION_START = 0;

/** Version that uses MSB VLong encoded output */
public static final int VERSION_MSB_VLONG_OUTPUT = 1;

/** Current terms format. */
public static final int VERSION_CURRENT = VERSION_START;
public static final int VERSION_CURRENT = VERSION_MSB_VLONG_OUTPUT;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a comment explaining what changed, and link to the GitHub issue?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wrote some comments on VERSION_MSB_VLONG_OUTPUT. I prefer to leave comments there, as VERSION_CURRENT could probably link to other version in feature. WDYT?

Issue added :)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, sorry, yes - VERSION_MSB_VLONG_OUTPUT is the right place for the comment! Thanks.


/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tip";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,19 @@ public String toString() {
return "BLOCK: prefix=" + brToString(prefix);
}

private static void writeMSBVLong(long i, DataOutput scratchBytes) throws IOException {
gf2121 marked this conversation as resolved.
Show resolved Hide resolved
assert i >= 0;
// Keep zero bits on most significant byte to have more chance to get prefix bytes shared.
// e.g. we expect 0x7FFF stored as [0x81, 0xFF, 0x7F] but not [0xFF, 0xFF, 0x40]
int LSBVLongBytes = (Long.SIZE - Long.numberOfLeadingZeros(i) - 1) / 7 + 1;
gf2121 marked this conversation as resolved.
Show resolved Hide resolved
i <<= Long.SIZE - LSBVLongBytes * 7;
for (int j = 1; j < LSBVLongBytes; j++) {
scratchBytes.writeByte((byte) (((i >>> 57) & 0x7FL) | 0x80));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe @uschindler can help review this heavy math :)

i = i << 7;
}
scratchBytes.writeByte((byte) (((i >>> 57) & 0x7FL)));
}

public void compileIndex(
List<PendingBlock> blocks,
ByteBuffersDataOutput scratchBytes,
Expand All @@ -472,10 +485,8 @@ public void compileIndex(

assert scratchBytes.size() == 0;

// TODO: try writing the leading vLong in MSB order
// (opposite of what Lucene does today), for better
// outputs sharing in the FST
scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor));
// write the leading vLong in MSB order for better outputs sharing in the FST
writeMSBVLong(encodeOutput(fp, hasTerms, isFloor), scratchBytes);
if (isFloor) {
scratchBytes.writeVInt(blocks.size() - 1);
for (int i = 1; i < blocks.size(); i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ private FST.Arc<BytesRef> getArc(int ord) {
SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length)
throws IOException {
scratchReader.reset(frameData.bytes, frameData.offset, frameData.length);
final long code = scratchReader.readVLong();
final long code = fr.readLongOutput(scratchReader);
final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord);
f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0;
Expand Down Expand Up @@ -980,7 +980,7 @@ private void printSeekState(PrintStream out) throws IOException {
} else if (isSeekFrame && !f.isFloor) {
final ByteArrayDataInput reader =
new ByteArrayDataInput(output.bytes, output.offset, output.length);
final long codeOrig = reader.readVLong();
final long codeOrig = fr.readLongOutput(reader);
final long code =
(f.fp << Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS)
| (f.hasTerms ? Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0)
Expand Down
Loading