From 96052891e62b2fc9300e619fff0fb29aa8a4e29a Mon Sep 17 00:00:00 2001 From: gf2121 <52390227+gf2121@users.noreply.github.com> Date: Wed, 4 Oct 2023 01:58:56 -0500 Subject: [PATCH] Reduce FST block size for BlockTreeTermsWriter (#12604) --- lucene/CHANGES.txt | 3 +++ .../blocktree/Lucene90BlockTreeTermsWriter.java | 13 +++++++++++++ .../src/java/org/apache/lucene/util/fst/FST.java | 4 ++++ 3 files changed, 20 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 93d2563b39de..e02da0a9f7da 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -167,6 +167,9 @@ Optimizations * GITHUB#12591: Use stable radix sort to speed up the sorting of update terms. (Guo Feng) +* GITHUB#12604: Estimate the block size of FST BytesStore in BlockTreeTermsWriter + to reduce GC load during indexing. (Guo Feng) + Changes in runtime behavior --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java index a7eb438489f9..1b0bd3568c9f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java @@ -52,6 +52,7 @@ import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.Util; +import org.apache.lucene.util.packed.PackedInts; /* TODO: @@ -490,10 +491,22 @@ public void compileIndex( } } + long estimateSize = prefix.length; + for (PendingBlock block : blocks) { + if (block.subIndices != null) { + for (FST subIndex : block.subIndices) { + estimateSize += subIndex.numBytes(); + } + } + } + int estimateBitsRequired = PackedInts.bitsRequired(estimateSize); + int pageBits = Math.min(15, Math.max(6, estimateBitsRequired)); + final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); final FSTCompiler fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs) .shouldShareNonSingletonNodes(false) + .bytesPageBits(pageBits) .build(); // if (DEBUG) { // System.out.println(" compile index for prefix=" + prefix); diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java index 816a28572681..fb356c2c9c7e 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java @@ -520,6 +520,10 @@ void finish(long newStartNode) throws IOException { bytes.finish(); } + public long numBytes() { + return bytes.getPosition(); + } + public T getEmptyOutput() { return emptyOutput; }