Skip to content

Commit 1761933

Browse files
committed
Optimize the compression of inverted index position information
1 parent 5a458e6 commit 1761933

16 files changed

+514
-25
lines changed

src/core/CLucene/index/FieldInfos.cpp

+13-5
Original file line numberDiff line numberDiff line change
@@ -125,23 +125,26 @@ void FieldInfos::addIndexed(const TCHAR** names, const bool storeTermVectors, co
125125

126126
void FieldInfos::add(const TCHAR** names, const bool isIndexed, const bool storeTermVectors,
127127
const bool storePositionWithTermVector, const bool storeOffsetWithTermVector,
128-
const bool omitNorms, const bool hasProx, const bool storePayloads) {
128+
const bool omitNorms, const bool hasProx, const bool storePayloads,
129+
IndexVersion indexVersion) {
129130
size_t i = 0;
130131
while ( names[i] != NULL ){
131132
add(names[i], isIndexed, storeTermVectors, storePositionWithTermVector,
132-
storeOffsetWithTermVector, omitNorms, hasProx, storePayloads);
133+
storeOffsetWithTermVector, omitNorms, hasProx, storePayloads, indexVersion);
133134
++i;
134135
}
135136
}
136137

137138
FieldInfo* FieldInfos::add(const TCHAR* name, const bool isIndexed, const bool storeTermVector,
138139
const bool storePositionWithTermVector,
139140
const bool storeOffsetWithTermVector, const bool omitNorms,
140-
const bool hasProx, const bool storePayloads) {
141+
const bool hasProx, const bool storePayloads,
142+
IndexVersion indexVersion) {
141143
FieldInfo* fi = fieldInfo(name);
142144
if (fi == NULL) {
143145
return addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector,
144-
storeOffsetWithTermVector, omitNorms, hasProx, storePayloads);
146+
storeOffsetWithTermVector, omitNorms, hasProx, storePayloads,
147+
indexVersion);
145148
} else {
146149
if (fi->isIndexed != isIndexed) {
147150
fi->isIndexed = true; // once indexed, always index
@@ -164,6 +167,9 @@ FieldInfo* FieldInfos::add(const TCHAR* name, const bool isIndexed, const bool s
164167
if (fi->storePayloads != storePayloads) {
165168
fi->storePayloads = true;
166169
}
170+
if (fi->indexVersion_ != indexVersion) {
171+
fi->indexVersion_ = indexVersion;
172+
}
167173
}
168174
return fi;
169175
}
@@ -172,10 +178,12 @@ FieldInfo* FieldInfos::addInternal(const TCHAR* name, const bool isIndexed,
172178
const bool storeTermVector,
173179
const bool storePositionWithTermVector,
174180
const bool storeOffsetWithTermVector, const bool omitNorms,
175-
const bool hasProx, const bool storePayloads) {
181+
const bool hasProx, const bool storePayloads,
182+
IndexVersion indexVersion) {
176183
FieldInfo* fi = _CLNEW FieldInfo(name, isIndexed, byNumber.size(), storeTermVector,
177184
storePositionWithTermVector, storeOffsetWithTermVector,
178185
omitNorms, hasProx, storePayloads);
186+
fi->setIndexVersion(indexVersion);
179187
byNumber.push_back(fi);
180188
byName.put( fi->name, fi);
181189
return fi;

src/core/CLucene/index/IndexVersion.h

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
enum class IndexVersion {
44
kV0 = 0,
55
kV1 = 1,
6+
kV2 = 2,
67

78
kNone
89
};

src/core/CLucene/index/IndexWriter.cpp

+27-7
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include "CLucene/analysis/AnalysisHeader.h"
1010
#include "CLucene/analysis/Analyzers.h"
11+
#include "CLucene/config/repl_wchar.h"
1112
#include "CLucene/document/Document.h"
1213
#include "CLucene/search/Similarity.h"
1314
#include "CLucene/store/Directory.h"
@@ -1327,22 +1328,27 @@ void IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
13271328
std::vector<lucene::index::IndexWriter *> destIndexWriterList;
13281329
std::vector<lucene::store::IndexOutput *> nullBitmapIndexOutputList;
13291330
try {
1330-
// check hasProx
1331+
// check hasProx, indexVersion
13311332
bool hasProx = false;
1333+
IndexVersion indexVersion = IndexVersion::kV1;
13321334
{
13331335
if (!readers.empty()) {
13341336
IndexReader* reader = readers[0];
13351337
hasProx = reader->getFieldInfos()->hasProx();
1338+
indexVersion = reader->getFieldInfos()->getIndexVersion();
13361339
for (int32_t i = 1; i < readers.size(); i++) {
13371340
if (hasProx != readers[i]->getFieldInfos()->hasProx()) {
13381341
_CLTHROWA(CL_ERR_IllegalArgument, "src_dirs hasProx inconformity");
13391342
}
1343+
if (indexVersion != readers[i]->getFieldInfos()->getIndexVersion()) {
1344+
_CLTHROWA(CL_ERR_IllegalArgument, "src_dirs indexVersion inconformity");
1345+
}
13401346
}
13411347
}
13421348
}
13431349

13441350
/// merge fields
1345-
mergeFields(hasProx);
1351+
mergeFields(hasProx, indexVersion);
13461352

13471353
/// write fields and create files writers
13481354
for (int j = 0; j < numDestIndexes; j++) {
@@ -1390,7 +1396,7 @@ void IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
13901396
}
13911397

13921398
/// merge terms
1393-
mergeTerms(hasProx);
1399+
mergeTerms(hasProx, indexVersion);
13941400

13951401
/// merge null_bitmap
13961402
mergeNullBitmap(srcNullBitmapValues, nullBitmapIndexOutputList);
@@ -1555,7 +1561,7 @@ void IndexWriter::compareIndexes(lucene::store::Directory *other) {
15551561
}
15561562
}
15571563

1558-
void IndexWriter::mergeFields(bool hasProx) {
1564+
void IndexWriter::mergeFields(bool hasProx, IndexVersion indexVersion) {
15591565
//Create a new FieldInfos
15601566
fieldInfos = _CLNEW FieldInfos();
15611567
//Condition check to see if fieldInfos points to a valid instance
@@ -1570,7 +1576,8 @@ void IndexWriter::mergeFields(bool hasProx) {
15701576
FieldInfo *fi = reader->getFieldInfos()->fieldInfo(j);
15711577
fieldInfos->add(fi->name, fi->isIndexed, fi->storeTermVector,
15721578
fi->storePositionWithTermVector, fi->storeOffsetWithTermVector,
1573-
!reader->hasNorms(fi->name), hasProx, fi->storePayloads);
1579+
!reader->hasNorms(fi->name), hasProx, fi->storePayloads,
1580+
fi->indexVersion_);
15741581
}
15751582
}
15761583

@@ -1614,7 +1621,7 @@ class postingQueue : public CL_NS(util)::PriorityQueue<DestDoc*,CL_NS(util)::Del
16141621

16151622
};
16161623

1617-
void IndexWriter::mergeTerms(bool hasProx) {
1624+
void IndexWriter::mergeTerms(bool hasProx, IndexVersion indexVersion) {
16181625
auto queue = _CLNEW SegmentMergeQueue(readers.size());
16191626
auto numSrcIndexes = readers.size();
16201627
//std::vector<TermPositions *> postingsList(numSrcIndexes);
@@ -1667,6 +1674,7 @@ void IndexWriter::mergeTerms(bool hasProx) {
16671674

16681675
std::vector<std::vector<uint32_t>> docDeltaBuffers(numDestIndexes);
16691676
std::vector<std::vector<uint32_t>> freqBuffers(numDestIndexes);
1677+
std::vector<std::vector<uint32_t>> posBuffers(numDestIndexes);
16701678
auto destPostingQueues = _CLNEW postingQueue(matchSize);
16711679
std::vector<DestDoc> destDocs(matchSize);
16721680

@@ -1758,6 +1766,7 @@ void IndexWriter::mergeTerms(bool hasProx) {
17581766
auto proxOut = proxOutputList[destIdx];
17591767
auto& docDeltaBuffer = docDeltaBuffers[destIdx];
17601768
auto& freqBuffer = freqBuffers[destIdx];
1769+
auto& posBuffer = posBuffers[destIdx];
17611770
auto skipWriter = skipListWriterList[destIdx];
17621771
auto& df = dfs[destIdx];
17631772
auto& lastDoc = lastDocs[destIdx];
@@ -1776,6 +1785,9 @@ void IndexWriter::mergeTerms(bool hasProx) {
17761785
encode(freqOut, docDeltaBuffer, true);
17771786
if (hasProx) {
17781787
encode(freqOut, freqBuffer, false);
1788+
if (indexVersion >= IndexVersion::kV2) {
1789+
PforUtil::encodePos(proxOut, posBuffer);
1790+
}
17791791
}
17801792

17811793
skipWriter->setSkipData(lastDoc, false, -1);
@@ -1791,7 +1803,11 @@ void IndexWriter::mergeTerms(bool hasProx) {
17911803
for (int32_t i = 0; i < descPositions.size(); i++) {
17921804
int32_t position = descPositions[i];
17931805
int32_t delta = position - lastPosition;
1794-
proxOut->writeVInt(delta);
1806+
if (indexVersion >= IndexVersion::kV2) {
1807+
posBuffer.push_back(delta);
1808+
} else {
1809+
proxOut->writeVInt(delta);
1810+
}
17951811
lastPosition = position;
17961812
}
17971813
freqBuffer.push_back(destFreq);
@@ -1828,6 +1844,7 @@ void IndexWriter::mergeTerms(bool hasProx) {
18281844
{
18291845
auto& docDeltaBuffer = docDeltaBuffers[i];
18301846
auto& freqBuffer = freqBuffers[i];
1847+
auto& posBuffer = posBuffers[i];
18311848

18321849
freqOutput->writeByte((char)CodeMode::kDefault);
18331850
freqOutput->writeVInt(docDeltaBuffer.size());
@@ -1851,6 +1868,9 @@ void IndexWriter::mergeTerms(bool hasProx) {
18511868
}
18521869
docDeltaBuffer.resize(0);
18531870
freqBuffer.resize(0);
1871+
if (indexVersion >= IndexVersion::kV2) {
1872+
PforUtil::encodePos(proxOutput, posBuffer);
1873+
}
18541874
}
18551875

18561876
int64_t skipPointer = skipListWriter->writeSkip(freqOutput);

src/core/CLucene/index/IndexWriter.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#ifndef _lucene_index_IndexWriter_
88
#define _lucene_index_IndexWriter_
99

10+
#include "CLucene/index/IndexVersion.h"
1011
#include "CLucene/util/VoidList.h"
1112
#include "CLucene/util/Array.h"
1213

@@ -320,11 +321,11 @@ class CLUCENE_EXPORT IndexWriter:LUCENE_BASE {
320321
std::vector<uint32_t> dest_index_docs);
321322

322323
// create new fields info
323-
void mergeFields(bool hasProx);
324+
void mergeFields(bool hasProx, IndexVersion indexVersion);
324325
// write fields info file
325326
void writeFields(lucene::store::Directory* d, std::string segment);
326327
// merge terms and write files
327-
void mergeTerms(bool hasProx);
328+
void mergeTerms(bool hasProx, IndexVersion indexVersion);
328329
// merge null_bitmap
329330
void mergeNullBitmap(std::vector<std::vector<uint32_t>> srcBitmapValues, std::vector<lucene::store::IndexOutput *> nullBitmapIndexOutputList);
330331

src/core/CLucene/index/SDocumentWriter.cpp

+11-1
Original file line numberDiff line numberDiff line change
@@ -1222,6 +1222,9 @@ void SDocumentsWriter<T>::appendPostings(ArrayBase<typename ThreadState::FieldDa
12221222
encode(freqOut, docDeltaBuffer, true);
12231223
if (hasProx_) {
12241224
encode(freqOut, freqBuffer, false);
1225+
if (indexVersion_ >= IndexVersion::kV2) {
1226+
PforUtil::encodePos(proxOut, posBuffer);
1227+
}
12251228
}
12261229

12271230
skipListWriter->setSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength);
@@ -1253,7 +1256,11 @@ void SDocumentsWriter<T>::appendPostings(ArrayBase<typename ThreadState::FieldDa
12531256
for (int32_t j = 0; j < termDocFreq; j++) {
12541257
const int32_t code = prox.readVInt();
12551258
assert(0 == (code & 1));
1256-
proxOut->writeVInt(code >> 1);
1259+
if (indexVersion_ >= IndexVersion::kV2) {
1260+
posBuffer.push_back(code >> 1);
1261+
} else {
1262+
proxOut->writeVInt(code >> 1);
1263+
}
12571264
}
12581265
freqBuffer.push_back(termDocFreq);
12591266
}
@@ -1310,6 +1317,9 @@ void SDocumentsWriter<T>::appendPostings(ArrayBase<typename ThreadState::FieldDa
13101317
}
13111318
docDeltaBuffer.resize(0);
13121319
freqBuffer.resize(0);
1320+
if (indexVersion_ >= IndexVersion::kV2) {
1321+
PforUtil::encodePos(proxOut, posBuffer);
1322+
}
13131323
}
13141324

13151325
int64_t skipPointer = skipListWriter->writeSkip(freqOut);

src/core/CLucene/index/SDocumentWriter.h

+1
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ class SDocumentsWriter : public IDocumentsWriter {
5252
std::string segment;// Current segment we are working on
5353
std::vector<uint32_t> docDeltaBuffer;
5454
std::vector<uint32_t> freqBuffer;
55+
std::vector<uint32_t> posBuffer;
5556
std::ostream* infoStream{};
5657
int64_t ramBufferSize;
5758
// Flush @ this number of docs. If rarmBufferSize is

src/core/CLucene/index/SegmentTermDocs.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ void TermDocsBuffer::refill() {
190190
cur_doc_ = 0;
191191
cur_freq_ = 0;
192192

193-
if (indexVersion_ == IndexVersion::kV1) {
193+
if (indexVersion_ >= IndexVersion::kV1) {
194194
size_ = refillV1();
195195
} else {
196196
size_ = refillV0();
@@ -199,7 +199,7 @@ void TermDocsBuffer::refill() {
199199

200200
void TermDocsBuffer::readRange(DocRange* docRange) {
201201
int32_t size = 0;
202-
if (indexVersion_ == IndexVersion::kV1) {
202+
if (indexVersion_ >= IndexVersion::kV1) {
203203
size = refillV1();
204204
} else {
205205
size = refillV0();

src/core/CLucene/index/SegmentTermPositions.cpp

+11-5
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ CL_NS_DEF(index)
1717
SegmentTermPositions::SegmentTermPositions(const SegmentReader* _parent):
1818
SegmentTermDocs(_parent), proxStream(NULL)// the proxStream will be cloned lazily when nextPosition() is called for the first time
1919
,lazySkipPointer(-1), lazySkipProxCount(0)
20+
, indexVersion_(_parent->_fieldInfos->getIndexVersion())
21+
, buffer_(proxStream, indexVersion_)
2022
{
2123
CND_CONDITION(_parent != NULL, "Parent is NULL");
2224
}
@@ -64,14 +66,15 @@ int32_t SegmentTermPositions::nextPosition() {
6466
}
6567

6668
int32_t SegmentTermPositions::readDeltaPosition() {
67-
int32_t delta = proxStream->readVInt();
69+
int32_t delta = buffer_.getPos();
6870
if (currentFieldStoresPayloads) {
6971
// if the current field stores payloads then
7072
// the position delta is shifted one bit to the left.
7173
// if the LSB is set, then we have to read the current
7274
// payload length
7375
if ((delta & 1) != 0) {
74-
payloadLength = proxStream->readVInt();
76+
// payloadLength = proxStream->readVInt();
77+
_CLTHROWA(CL_ERR_UnsupportedOperation, "Processing the payload flow is not supported at the moment");
7578
}
7679
delta = (int32_t)((uint32_t)delta >> (uint32_t)1);
7780
needToLoadPayload = true;
@@ -122,7 +125,8 @@ void SegmentTermPositions::skipPositions(const int32_t n) {
122125

123126
void SegmentTermPositions::skipPayload() {
124127
if (needToLoadPayload && payloadLength > 0) {
125-
proxStream->seek(proxStream->getFilePointer() + payloadLength);
128+
// proxStream->seek(proxStream->getFilePointer() + payloadLength);
129+
_CLTHROWA(CL_ERR_UnsupportedOperation, "Processing the payload flow is not supported at the moment");
126130
}
127131
needToLoadPayload = false;
128132
}
@@ -131,14 +135,15 @@ void SegmentTermPositions::lazySkip() {
131135
if (proxStream == NULL) {
132136
// clone lazily
133137
proxStream = parent->proxStream->clone();
138+
buffer_.reset(proxStream);
134139
}
135140

136141
// we might have to skip the current payload
137142
// if it was not read yet
138143
skipPayload();
139144

140145
if (lazySkipPointer != -1) {
141-
proxStream->seek(lazySkipPointer);
146+
buffer_.seek(lazySkipPointer);
142147
lazySkipPointer = -1;
143148
}
144149

@@ -166,7 +171,8 @@ uint8_t* SegmentTermPositions::getPayload(uint8_t* data) {
166171
} else {
167172
retArray = data;
168173
}
169-
proxStream->readBytes(retArray, payloadLength);
174+
// proxStream->readBytes(retArray, payloadLength);
175+
_CLTHROWA(CL_ERR_UnsupportedOperation, "Processing the payload flow is not supported at the moment");
170176
needToLoadPayload = false;
171177
return retArray;
172178
}

src/core/CLucene/index/_FieldInfos.h

+6-3
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,8 @@ class CLUCENE_EXPORT FieldInfos :LUCENE_BASE{
146146
void add(const TCHAR** names, const bool isIndexed, const bool storeTermVector = false,
147147
const bool storePositionWithTermVector = false,
148148
const bool storeOffsetWithTermVector = false, const bool omitNorms = false,
149-
const bool hasProx = false, const bool storePayloads = false);
149+
const bool hasProx = false, const bool storePayloads = false,
150+
IndexVersion indexVersion = IndexVersion::kV1);
150151

151152
// Merges in information from another FieldInfos.
152153
void add(FieldInfos* other);
@@ -167,13 +168,15 @@ class CLUCENE_EXPORT FieldInfos :LUCENE_BASE{
167168
FieldInfo* add(const TCHAR* name, const bool isIndexed, const bool storeTermVector = false,
168169
const bool storePositionWithTermVector = false,
169170
const bool storeOffsetWithTermVector = false, const bool omitNorms = false,
170-
const bool hasProx = false, const bool storePayloads = false);
171+
const bool hasProx = false, const bool storePayloads = false,
172+
IndexVersion indexVersion = IndexVersion::kV1);
171173

172174
// was void
173175
FieldInfo* addInternal(const TCHAR* name, const bool isIndexed, const bool storeTermVector,
174176
const bool storePositionWithTermVector,
175177
const bool storeOffsetWithTermVector, const bool omitNorms,
176-
const bool hasProx, const bool storePayloads);
178+
const bool hasProx, const bool storePayloads,
179+
IndexVersion indexVersion = IndexVersion::kV1);
177180

178181
int32_t fieldNumber(const TCHAR* fieldName)const;
179182

0 commit comments

Comments
 (0)