Skip to content

Commit

Permalink
Optimize outputs accumulating for SegmentTermsEnum and IntersectTermsEnum (apache#12699)
Browse files Browse the repository at this point in the history
  • Loading branch information
gf2121 authored Nov 28, 2023
1 parent 38ca8d3 commit 9574cbd
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 78 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,7 @@
import org.apache.lucene.util.automaton.ByteRunnable;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.automaton.TransitionAccessor;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;

/**
* This is used to implement efficient {@link Terms#intersect} for block-tree. Note that it cannot
Expand All @@ -46,7 +44,6 @@ final class IntersectTermsEnum extends BaseTermsEnum {
// static boolean DEBUG = BlockTreeTermsWriter.DEBUG;

final IndexInput in;
static final Outputs<BytesRef> fstOutputs = ByteSequenceOutputs.getSingleton();

IntersectTermsEnumFrame[] stack;

Expand All @@ -68,6 +65,9 @@ final class IntersectTermsEnum extends BaseTermsEnum {

private BytesRef savedStartTerm;

private final SegmentTermsEnum.OutputAccumulator outputAccumulator =
new SegmentTermsEnum.OutputAccumulator();

// TODO: in some cases we can filter by length? eg
// regexp foo*bar must be at least length 6 bytes
public IntersectTermsEnum(
Expand Down Expand Up @@ -114,7 +114,6 @@ public IntersectTermsEnum(
f.prefix = 0;
f.setState(0);
f.arc = arc;
f.outputPrefix = arc.output();
f.load(fr.rootCode);

// for assert:
Expand Down Expand Up @@ -184,22 +183,24 @@ private IntersectTermsEnumFrame pushFrame(int state) throws IOException {
FST.Arc<BytesRef> arc = currentFrame.arc;
int idx = currentFrame.prefix;
assert currentFrame.suffix > 0;
BytesRef output = currentFrame.outputPrefix;

outputAccumulator.reset();
outputAccumulator.push(arc.output());
while (idx < f.prefix) {
final int target = term.bytes[idx] & 0xff;
// TODO: we could be more efficient for the next()
// case by using current arc as starting point,
// passed to findTargetArc
arc = fr.index.findTargetArc(target, arc, getArc(1 + idx), fstReader);
assert arc != null;
output = fstOutputs.add(output, arc.output());
outputAccumulator.push(arc.output());
idx++;
}

f.arc = arc;
f.outputPrefix = output;
assert arc.isFinal();
f.load(fstOutputs.add(output, arc.nextFinalOutput()));
outputAccumulator.push(arc.nextFinalOutput());
f.load(outputAccumulator);
return f;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ final class IntersectTermsEnumFrame {
int statsSingletonRunLength = 0;
final ByteArrayDataInput statsReader = new ByteArrayDataInput();

byte[] floorData = new byte[32];
final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();

// Length of prefix shared by all terms in this block
Expand Down Expand Up @@ -90,9 +89,6 @@ final class IntersectTermsEnumFrame {

final ByteArrayDataInput bytesReader = new ByteArrayDataInput();

// Cumulative output so far
BytesRef outputPrefix;

int startBytePos;
int suffix;

Expand Down Expand Up @@ -120,7 +116,7 @@ void loadNextFloorBlock() throws IOException {
}
} while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min);

load(null);
load((Long) null);
}

public void setState(int state) {
Expand All @@ -142,12 +138,22 @@ public void setState(int state) {
}

void load(BytesRef frameIndexData) throws IOException {
if (frameIndexData != null) {
floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
// Skip first long -- has redundant fp, hasTerms
// flag, isFloor flag
final long code = ite.fr.readVLongOutput(floorDataReader);
if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
load(ite.fr.readVLongOutput(floorDataReader));
}

/**
 * Loads this frame from the FST outputs collected in {@code outputAccumulator}.
 *
 * <p>Calls {@code prepareRead()} on the accumulator, decodes the block code from it with
 * {@code ite.fr.readVLongOutput}, passes {@code floorDataReader} to the accumulator's
 * {@code setFloorData}, and then delegates to the {@code load(Long)} overload.
 *
 * @param outputAccumulator accumulator holding the pushed outputs to read the block code from
 * @throws IOException declared by this method and by the {@code load(Long)} overload it calls
 */
void load(SegmentTermsEnum.OutputAccumulator outputAccumulator) throws IOException {
outputAccumulator.prepareRead();
long code = ite.fr.readVLongOutput(outputAccumulator);
// NOTE(review): presumably this leaves floorDataReader over whatever bytes follow the block
// code, which load(Long) then reads for floor frames -- confirm against
// SegmentTermsEnum.OutputAccumulator#setFloorData. The call is made only after the code has
// been read; the ordering looks significant.
outputAccumulator.setFloorData(floorDataReader);
// The primitive long is boxed here, which selects the load(Long) overload.
load(code);
}

void load(Long blockCode) throws IOException {
if (blockCode != null) {
// This block is the first one in a possible sequence of floor blocks corresponding to a
// single seek point from the FST terms index
if ((blockCode & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
// Floor frame
numFollowFloorBlocks = floorDataReader.readVInt();
nextFloorLabel = floorDataReader.readByte() & 0xff;
Expand Down
Loading

0 comments on commit 9574cbd

Please sign in to comment.