Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[opt](inverted index) add performance profiling for remote io access in inverted index #250

Merged
merged 1 commit into from
Nov 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions src/core/CLucene/index/IndexReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ CL_NS_DEF(index)
return SegmentInfos::getCurrentSegmentGeneration(directory) != -1;
}

TermDocs* IndexReader::termDocs(Term* term) {
TermDocs* IndexReader::termDocs(Term* term, const void* io_ctx) {
//Func - Returns an enumeration of all the documents which contain
// term. For each document, the document number, the frequency of
// the term in that document is also provided, for use in search scoring.
Expand All @@ -268,14 +268,14 @@ CL_NS_DEF(index)

ensureOpen();
//Reference an instantiated TermDocs instance
TermDocs* _termDocs = termDocs();
TermDocs* _termDocs = termDocs(io_ctx);
//Seek all documents containing term
_termDocs->seek(term);
//return the enumeration
return _termDocs;
}

TermPositions* IndexReader::termPositions(Term* term){
TermPositions* IndexReader::termPositions(Term* term, const void* io_ctx){
//Func - Returns an enumeration of all the documents which contain term. For each
// document, in addition to the document number and frequency of the term in
// that document, a list of all of the ordinal positions of the term in the document
Expand All @@ -294,7 +294,7 @@ CL_NS_DEF(index)

ensureOpen();
//Reference an instantiated termPositions instance
TermPositions* _termPositions = termPositions();
TermPositions* _termPositions = termPositions(io_ctx);
//Seek all documents containing term
_termPositions->seek(term);
//return the enumeration
Expand Down
8 changes: 4 additions & 4 deletions src/core/CLucene/index/IndexReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -564,7 +564,7 @@ class CLUCENE_EXPORT IndexReader: public CL_NS(util)::NamedObject{
* @throws IOException if there is a low-level IO error
* @memory Caller must clean up
*/
virtual TermPositions* termPositions() = 0;
virtual TermPositions* termPositions(const void* io_ctx = nullptr) = 0;

/** Returns an enumeration of all the documents which contain
* <code>term</code>. For each document, in addition to the document number
Expand All @@ -584,13 +584,13 @@ class CLUCENE_EXPORT IndexReader: public CL_NS(util)::NamedObject{
* @throws IOException if there is a low-level IO error
* @memory Caller must clean up
*/
TermPositions* termPositions(Term* term);
TermPositions* termPositions(Term* term, const void* io_ctx = nullptr);

/** Returns an unpositioned {@link TermDocs} enumerator.
* @throws IOException if there is a low-level IO error
* @memory Caller must clean up
*/
virtual TermDocs* termDocs() = 0;
virtual TermDocs* termDocs(const void* io_ctx = nullptr) = 0;

/** Returns an enumeration of all the documents which contain
* <code>term</code>. For each document, the document number, the frequency of
Expand All @@ -602,7 +602,7 @@ class CLUCENE_EXPORT IndexReader: public CL_NS(util)::NamedObject{
* @throws IOException if there is a low-level IO error
* @memory Caller must clean up
*/
TermDocs* termDocs(Term* term);
TermDocs* termDocs(Term* term, const void* io_ctx = nullptr);

/** Deletes the document numbered <code>docNum</code>. Once a document is
* deleted it will not appear in TermDocs or TermPositions enumerations.
Expand Down
6 changes: 4 additions & 2 deletions src/core/CLucene/index/MultiReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -271,15 +271,17 @@ int32_t MultiReader::docFreq(const Term* t) {
return total;
}

TermDocs* MultiReader::termDocs() {
TermDocs* MultiReader::termDocs(const void* io_ctx) {
ensureOpen();
TermDocs* ret = _CLNEW MultiTermDocs(subReaders, starts);
ret->setIoContext(io_ctx);
return ret;
}

TermPositions* MultiReader::termPositions() {
TermPositions* MultiReader::termPositions(const void* io_ctx) {
ensureOpen();
TermPositions* ret = (TermPositions*)_CLNEW MultiTermPositions(subReaders, starts);
ret->setIoContext(io_ctx);
return ret;
}

Expand Down
4 changes: 2 additions & 2 deletions src/core/CLucene/index/MultiReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@ class CLUCENE_EXPORT MultiReader:public IndexReader{

//Returns the document frequency of the current term in the set
int32_t docFreq(const Term* t=NULL);
TermDocs* termDocs();
TermPositions* termPositions();
TermDocs* termDocs(const void* io_ctx = nullptr);
TermPositions* termPositions(const void* io_ctx = nullptr);

/**
* @see IndexReader#getFieldNames(IndexReader.FieldOption fldOption)
Expand Down
14 changes: 10 additions & 4 deletions src/core/CLucene/index/MultiSegmentReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -355,15 +355,17 @@ int32_t MultiSegmentReader::docFreq(const Term* t) {
return total;
}

TermDocs* MultiSegmentReader::termDocs() {
TermDocs* MultiSegmentReader::termDocs(const void* io_ctx) {
ensureOpen();
TermDocs* ret = _CLNEW MultiTermDocs(subReaders, starts);
ret->setIoContext(io_ctx);
return ret;
}

TermPositions* MultiSegmentReader::termPositions() {
TermPositions* MultiSegmentReader::termPositions(const void* io_ctx) {
ensureOpen();
TermPositions* ret = static_cast<TermPositions*>(_CLNEW MultiTermPositions(subReaders, starts));
ret->setIoContext(io_ctx);
return ret;
}

Expand Down Expand Up @@ -559,6 +561,10 @@ int32_t MultiTermDocs::docFreq() {
return docFreq;
}

void MultiTermDocs::setIoContext(const void* io_ctx) {
io_ctx_ = io_ctx;
}

int32_t MultiTermDocs::doc() const {
CND_PRECONDITION(current!=NULL,"current==NULL, check that next() was called");
// if not found term, current will return INT_MAX, we could not add base, otherwise it will overflow.
Expand Down Expand Up @@ -724,7 +730,7 @@ void MultiTermDocs::close() {
}

TermDocs* MultiTermDocs::termDocs(IndexReader* reader) {
return reader->termDocs();
return reader->termDocs(io_ctx_);
}

TermDocs* MultiTermDocs::termDocs(const int32_t i) {
Expand Down Expand Up @@ -920,7 +926,7 @@ TermDocs* MultiTermPositions::termDocs(IndexReader* reader) {
// rather merely producing a SegmentTermDocs via the reader's termDocs
// method.

TermPositions* tp = reader->termPositions();
TermPositions* tp = reader->termPositions(io_ctx_);
TermDocs* ret = tp->__asTermDocs();

CND_CONDITION(ret != NULL,
Expand Down
12 changes: 8 additions & 4 deletions src/core/CLucene/index/SegmentReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -506,22 +506,26 @@ bool SegmentReader::isDeleted(const int32_t n) {
return ret;
}

TermDocs *SegmentReader::termDocs() {
TermDocs *SegmentReader::termDocs(const void* io_ctx) {
//Func - Returns an unpositioned TermDocs enumerator.
//Pre - true
//Post - An unpositioned TermDocs enumerator has been returned

ensureOpen();
return _CLNEW SegmentTermDocs(this);
auto* ret = _CLNEW SegmentTermDocs(this);
ret->setIoContext(io_ctx);
return ret;
}

TermPositions *SegmentReader::termPositions() {
TermPositions *SegmentReader::termPositions(const void* io_ctx) {
//Func - Returns an unpositioned TermPositions enumerator.
//Pre - true
//Post - An unpositioned TermPositions enumerator has been returned

ensureOpen();
return _CLNEW SegmentTermPositions(this);
auto* ret = _CLNEW SegmentTermPositions(this);
ret->setIoContext(io_ctx);
return ret;
}

int32_t SegmentReader::docFreq(const Term *t) {
Expand Down
14 changes: 13 additions & 1 deletion src/core/CLucene/index/SegmentTermDocs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,16 @@ TermPositions *SegmentTermDocs::__asTermPositions() {
return NULL;
}

void SegmentTermDocs::setIoContext(const void* io_ctx) {
if (parent && parent->tis) {
parent->tis->setIoContext(io_ctx);
}
if (freqStream) {
freqStream->setIoContext(io_ctx);
}
io_ctx_ = io_ctx;
}

int32_t SegmentTermDocs::docFreq() {
return df;
}
Expand Down Expand Up @@ -159,8 +169,10 @@ bool SegmentTermDocs::skipTo(const int32_t target) {
assert(count <= df);

if (df >= skipInterval) {// optimized case
if (skipListReader == NULL)
if (skipListReader == NULL) {
skipListReader = _CLNEW DefaultSkipListReader(freqStream->clone(), maxSkipLevels, skipInterval);// lazily clone
skipListReader->setIoContext(io_ctx_);
}

if (!haveSkipped) {// lazily initialize skip stream
skipListReader->init(skipPointer, freqBasePointer, proxBasePointer, df, hasProx, currentFieldStoresPayloads);
Expand Down
6 changes: 6 additions & 0 deletions src/core/CLucene/index/SegmentTermEnum.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -420,4 +420,10 @@ void SegmentTermEnum::growBuffer(const uint32_t length, bool force_copy) {
}
}

// Forwards the opaque I/O context pointer to the underlying input stream.
// Does nothing when the enumerator has no input.
void SegmentTermEnum::setIoContext(const void* io_ctx) {
    if (input == nullptr) {
        return;
    }
    input->setIoContext(io_ctx);
}

CL_NS_END
5 changes: 5 additions & 0 deletions src/core/CLucene/index/SegmentTermPositions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ SegmentTermPositions::~SegmentTermPositions() {
close();
}

void SegmentTermPositions::setIoContext(const void* io_ctx) {
SegmentTermDocs::setIoContext(io_ctx);
}

// Returns this object viewed through its TermDocs interface.
// No new object is created; the returned pointer refers to *this.
TermDocs* SegmentTermPositions::__asTermDocs(){
return (TermDocs*) this;
}
Expand Down Expand Up @@ -135,6 +139,7 @@ void SegmentTermPositions::lazySkip() {
if (proxStream == NULL) {
// clone lazily
proxStream = parent->proxStream->clone();
proxStream->setIoContext(io_ctx_);
buffer_.reset(proxStream);
}

Expand Down
6 changes: 6 additions & 0 deletions src/core/CLucene/index/SkipListReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,12 @@ int32_t MultiLevelSkipListReader::skipTo(const int32_t target) {
return numSkipped[0] - skipInterval[0] - 1;
}

// Forwards the opaque I/O context pointer to the level-0 skip stream, if present.
// NOTE(review): only skipStream[0] is updated here; verify that the streams
// for the other skip levels receive the context elsewhere.
void MultiLevelSkipListReader::setIoContext(const void* io_ctx) {
    if (skipStream[0] == nullptr) {
        return;
    }
    skipStream[0]->setIoContext(io_ctx);
}

bool MultiLevelSkipListReader::loadNextSkip(const int32_t level) {
// we have to skip, the target document is greater than the current
// skip list entry
Expand Down
7 changes: 7 additions & 0 deletions src/core/CLucene/index/TermInfosReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,9 @@ TermInfo* TermInfosReader::get(const Term* term) {

// optimize sequential access: first try scanning cached enum w/o seeking
SegmentTermEnum* enumerator = getEnum();
if (enumerator) {
enumerator->setIoContext(io_ctx_);
}

// optimize sequential access: first try scanning cached enumerator w/o seeking
if (
Expand Down Expand Up @@ -265,6 +268,10 @@ TermInfo* TermInfosReader::get(const Term* term) {
return scanEnum(term);
}

void TermInfosReader::setIoContext(const void* io_ctx) {
io_ctx_ = io_ctx;
}

int64_t TermInfosReader::getPosition(const Term* term) {
//Func - Returns the position of a Term in the set
//Pre - term holds a valid reference to a Term
Expand Down
2 changes: 2 additions & 0 deletions src/core/CLucene/index/Terms.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ class CLUCENE_EXPORT TermDocs {
*/
virtual TermPositions* __asTermPositions()=0;

/** Attaches an opaque I/O context pointer to this enumerator.
* The default implementation ignores the argument; subclasses override it
* to store or forward the context.
*/
virtual void setIoContext(const void*) {}

virtual int32_t docFreq() {
_CLTHROWA(CL_ERR_UnsupportedOperation, "TermDocs::docFreq does not support this method.");
}
Expand Down
9 changes: 7 additions & 2 deletions src/core/CLucene/index/_MultiSegmentReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,8 @@ class MultiSegmentReader:public DirectoryIndexReader{

//Returns the document frequency of the current term in the set
int32_t docFreq(const Term* t=NULL);
TermDocs* termDocs();
TermPositions* termPositions();
TermDocs* termDocs(const void* io_ctx = nullptr);
TermPositions* termPositions(const void* io_ctx = nullptr);

void getFieldNames (FieldOption fldOption, StringArrayWithDeletor& retarray);
static void getFieldNames(FieldOption fldOption, StringArrayWithDeletor& retarray, CL_NS(util)::ArrayBase<IndexReader*>* subReaders);
Expand Down Expand Up @@ -173,6 +173,11 @@ class MultiTermDocs:public virtual TermDocs {
virtual TermPositions* __asTermPositions();

int32_t docFreq() override;

void setIoContext(const void* io_ctx) override;

protected:
const void* io_ctx_ = nullptr;
};


Expand Down
11 changes: 8 additions & 3 deletions src/core/CLucene/index/_SegmentHeader.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,8 @@ class SegmentTermDocs:public virtual TermDocs {
protected:
bool currentFieldStoresPayloads;
bool hasProx = false;
IndexVersion indexVersion_ = IndexVersion::kV0;
IndexVersion indexVersion_ = IndexVersion::kV0;
const void* io_ctx_ = nullptr;

public:
///\param Parent must be a segment reader
Expand All @@ -197,6 +198,8 @@ class SegmentTermDocs:public virtual TermDocs {

virtual TermPositions* __asTermPositions();

void setIoContext(const void* io_ctx) override;

int32_t docFreq() override;

protected:
Expand Down Expand Up @@ -234,6 +237,8 @@ class SegmentTermPositions: public SegmentTermDocs, public TermPositions {
SegmentTermPositions(const SegmentReader* Parent);
virtual ~SegmentTermPositions();

void setIoContext(const void* io_ctx) override;

private:
void seek(const TermInfo* ti, Term* term);

Expand Down Expand Up @@ -473,9 +478,9 @@ class SegmentReader: public DirectoryIndexReader {
bool isDeleted(const int32_t n);

///Returns an unpositioned TermDocs enumerator.
TermDocs* termDocs();
TermDocs* termDocs(const void* io_ctx = nullptr);
///Returns an unpositioned TermPositions enumerator.
TermPositions* termPositions();
TermPositions* termPositions(const void* io_ctx = nullptr);

///Returns the number of documents which contain the term t
int32_t docFreq(const Term* t);
Expand Down
2 changes: 2 additions & 0 deletions src/core/CLucene/index/_SegmentTermEnum.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ class SegmentTermEnum:public TermEnum{

int32_t getFormat() { return format; }

void setIoContext(const void* io_ctx);

private:
/**
* Reads the next term in the enumeration
Expand Down
2 changes: 2 additions & 0 deletions src/core/CLucene/index/_SkipListReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ class MultiLevelSkipListReader : LUCENE_BASE {
*/
int32_t skipTo(const int32_t target);

void setIoContext(const void* io_ctx);

private:
bool loadNextSkip(const int32_t level);

Expand Down
5 changes: 5 additions & 0 deletions src/core/CLucene/index/_TermInfosReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ CL_NS_DEF(index)

int64_t numBytesUsed;

const void* io_ctx_ = nullptr;

DEFINE_MUTEX(THIS_LOCK)

public:
Expand Down Expand Up @@ -108,6 +110,9 @@ CL_NS_DEF(index)
int64_t getRAMUsed() const {
return numBytesUsed;
}

void setIoContext(const void* io_ctx = nullptr);

private:
/** Reads the term info index file or .tti file. */
void ensureIndexIsRead();
Expand Down
3 changes: 3 additions & 0 deletions src/core/CLucene/store/IndexInput.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,9 @@ CL_NS_DEF(store)

virtual void setIdxFileCache(bool index) {}

/** Attaches an opaque I/O context pointer to this input.
* The default implementation ignores the argument.
*/
virtual void setIoContext(const void*) {}
/** Returns the opaque I/O context pointer attached to this input.
* The default implementation holds no context and returns nullptr.
* (An empty body here would flow off the end of a value-returning
* function, which is undefined behavior when called.)
*/
virtual const void* getIoContext() { return nullptr; }

};

/** Abstract base class for input from a file in a {@link Directory}. A
Expand Down
Loading