A new stat for bytes read off the disk #1702

Merged · Jul 13, 2022 · 17 commits
Changes from 11 commits
19 changes: 19 additions & 0 deletions index/scorch/merge.go
@@ -327,6 +327,7 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context,
fileMergeZapStartTime := time.Now()

atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1)
prevBytesReadTotal := cumulateBytesRead(segmentsToMerge)
newDocNums, _, err := s.segPlugin.Merge(segmentsToMerge, docsToDrop, path,
cw.cancelCh, s)
atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1)
@@ -352,6 +353,14 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context,
atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1)
return err
}

switch segI := seg.(type) {
case segment.BytesOffDiskStats:
totalBytesRead := segI.BytesRead() + prevBytesReadTotal
segI.SetBytesRead(totalBytesRead)
seg = segI.(segment.Segment)
}

oldNewDocNums = make(map[uint64][]uint64)
for i, segNewDocNums := range newDocNums {
oldNewDocNums[task.Segments[i].Id()] = segNewDocNums
@@ -426,6 +435,16 @@ type segmentMerge struct {
notifyCh chan *mergeTaskIntroStatus
}

func cumulateBytesRead(sbs []segment.Segment) uint64 {
rv := uint64(0)
Contributor: var rv uint64?

Member Author: done.

for _, seg := range sbs {
if segI, ok := seg.(segment.BytesOffDiskStats); ok {
rv += segI.BytesRead()
}
}
return rv
}
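
The switch above and cumulateBytesRead both rely on the segment plugin exposing cumulative byte counters. As a rough sketch (not part of this diff; only the two methods actually used here are shown, and the real definition lives in the segment API package), the interface presumably looks like this:

```go
// Sketch only: method set inferred from its usage in this file.
type BytesOffDiskStats interface {
	// BytesRead reports the cumulative number of bytes this component has
	// read off the disk so far.
	BytesRead() uint64
	// SetBytesRead overwrites the cumulative counter, used here to carry
	// the totals of the merged-away segments over to the new segment.
	SetBytesRead(uint64)
}
```

With that in place, the carry-forward in planMergeAtSnapshot is simply the new segment's own BytesRead() plus the sum returned by cumulateBytesRead over the segments being merged away.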

// perform a merging of the given SegmentBase instances into a new,
// persisted segment, and synchronously introduce that new segment
// into the root
20 changes: 20 additions & 0 deletions index/scorch/scorch.go
@@ -387,13 +387,23 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) {
analysisResults := make([]index.Document, int(numUpdates))
var itemsDeQueued uint64
var totalAnalysisSize int
analysisBytes := func(tokMap index.TokenFrequencies) (rv uint64) {
Member: Should we add a flag here to calculate this only when the flag is set? Feels like this is something we don't always necessarily need to do - hope none of this calculation shows up in CPU profiles?

Member: Not sure if this is enough; we have to record copies - stored values and doc values, also term vectors. I don't see you doing this in your zap PR.

Member Author: Just to close the thread, these two changes are going to be included in upcoming PRs.

for k := range tokMap {
rv += uint64(len(k))
}
return rv
}
for itemsDeQueued < numUpdates {
result := <-resultChan
resultSize := result.Size()
atomic.AddUint64(&s.iStats.analysisBytesAdded, uint64(resultSize))
totalAnalysisSize += resultSize
analysisResults[itemsDeQueued] = result
itemsDeQueued++
result.VisitFields(func(f index.Field) {
atomic.AddUint64(&s.stats.TotIndexedAnalysisBytes,
Contributor: Better stats wording: TotBytesIndexedAfterAnalysis? TotBytesReadDuringQueryTime or TotBytesReadAtQueryTime, so that both stats share the same prefix.

Member Author: done.

analysisBytes(f.AnalyzedTokenFrequencies()))
})
}
close(resultChan)
defer atomic.AddUint64(&s.iStats.analysisBytesRemoved, uint64(totalAnalysisSize))
@@ -525,6 +535,14 @@ func (s *Scorch) Stats() json.Marshaler {
return &s.stats
}

func (s *Scorch) BytesRead() uint64 {
return s.stats.TotBytesReadQueryTime
}

func (s *Scorch) SetBytesRead(val uint64) {
atomic.StoreUint64(&s.stats.TotBytesReadQueryTime, val)
}

func (s *Scorch) diskFileStats(rootSegmentPaths map[string]struct{}) (uint64,
uint64, uint64) {
var numFilesOnDisk, numBytesUsedDisk, numBytesOnDiskByRoot uint64
@@ -582,7 +600,9 @@ func (s *Scorch) StatsMap() map[string]interface{} {
m["index_time"] = m["TotIndexTime"]
m["term_searchers_started"] = m["TotTermSearchersStarted"]
m["term_searchers_finished"] = m["TotTermSearchersFinished"]
m["num_bytes_read_query_time"] = m["TotBytesReadQueryTime"]
m["num_plain_text_bytes_indexed"] = m["TotIndexedPlainTextBytes"]
m["num_analysis_bytes_indexed"] = m["TotIndexedAnalysisBytes"]
m["num_items_introduced"] = m["TotIntroducedItems"]
m["num_items_persisted"] = m["TotPersistedItems"]
m["num_recs_to_persist"] = m["TotItemsToPersist"]
78 changes: 67 additions & 11 deletions index/scorch/snapshot_index.go
@@ -60,6 +60,8 @@ var reflectStaticSizeIndexSnapshot int
// in the kvConfig.
var DefaultFieldTFRCacheThreshold uint64 = 10

type bytesOffDiskStats segment.BytesOffDiskStats

func init() {
var is interface{} = IndexSnapshot{}
reflectStaticSizeIndexSnapshot = int(reflect.TypeOf(is).Size())
@@ -146,10 +148,18 @@ func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string,
results := make(chan *asynchSegmentResult)
for index, segment := range i.segment {
go func(index int, segment *SegmentSnapshot) {
var prevBytesRead uint64
if seg, ok := segment.segment.(bytesOffDiskStats); ok {
prevBytesRead = seg.BytesRead()
}
dict, err := segment.segment.Dictionary(field)
if err != nil {
results <- &asynchSegmentResult{err: err}
} else {
if seg, ok := segment.segment.(bytesOffDiskStats); ok {
atomic.AddUint64(&i.parent.stats.TotBytesReadQueryTime,
seg.BytesRead()-prevBytesRead)
}
if randomLookup {
results <- &asynchSegmentResult{dict: dict}
} else {
@@ -424,6 +434,10 @@ func (i *IndexSnapshot) Document(id string) (rv index.Document, err error) {
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum)

rvd := document.NewDocument(id)
var prevBytesRead uint64
if seg, ok := i.segment[segmentIndex].segment.(segment.BytesOffDiskStats); ok {
prevBytesRead = seg.BytesRead()
}
err = i.segment[segmentIndex].VisitDocument(localDocNum, func(name string, typ byte, val []byte, pos []uint64) bool {
if name == "_id" {
return true
@@ -453,7 +467,10 @@ func (i *IndexSnapshot) Document(id string) (rv index.Document, err error) {
if err != nil {
return nil, err
}

if seg, ok := i.segment[segmentIndex].segment.(segment.BytesOffDiskStats); ok {
delta := seg.BytesRead() - prevBytesRead
atomic.AddUint64(&i.parent.stats.TotBytesReadQueryTime, delta)
}
return rvd, nil
}

@@ -505,46 +522,78 @@ func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err err
return next.ID, nil
}

func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq,
func (is *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq,
includeNorm, includeTermVectors bool) (index.TermFieldReader, error) {
rv := i.allocTermFieldReaderDicts(field)
rv := is.allocTermFieldReaderDicts(field)

rv.term = term
rv.field = field
rv.snapshot = i
rv.snapshot = is
if rv.postings == nil {
rv.postings = make([]segment.PostingsList, len(i.segment))
rv.postings = make([]segment.PostingsList, len(is.segment))
}
if rv.iterators == nil {
rv.iterators = make([]segment.PostingsIterator, len(i.segment))
rv.iterators = make([]segment.PostingsIterator, len(is.segment))
}
rv.segmentOffset = 0
rv.includeFreq = includeFreq
rv.includeNorm = includeNorm
rv.includeTermVectors = includeTermVectors
rv.currPosting = nil
rv.currID = rv.currID[:0]

if rv.dicts == nil {
rv.dicts = make([]segment.TermDictionary, len(i.segment))
for i, segment := range i.segment {
rv.dicts = make([]segment.TermDictionary, len(is.segment))
for i, segment := range is.segment {
var prevBytesRead uint64
if segP, ok := segment.segment.(bytesOffDiskStats); ok {
prevBytesRead = segP.BytesRead()
}
dict, err := segment.segment.Dictionary(field)
if err != nil {
return nil, err
}
if segP, ok := segment.segment.(bytesOffDiskStats); ok {
atomic.AddUint64(&is.parent.stats.TotBytesReadQueryTime, segP.BytesRead()-prevBytesRead)
}
rv.dicts[i] = dict
}
}

for i, segment := range i.segment {
for i, segment := range is.segment {
var prevBytesReadPL uint64
if _, ok := segment.segment.(bytesOffDiskStats); ok {
if postings, ok := rv.postings[i].(bytesOffDiskStats); ok {
prevBytesReadPL = postings.BytesRead()
}
}
pl, err := rv.dicts[i].PostingsList(term, segment.deleted, rv.postings[i])
if err != nil {
return nil, err
}
rv.postings[i] = pl
var prevBytesReadItr uint64
if _, ok := segment.segment.(bytesOffDiskStats); ok {
if itr, ok := rv.iterators[i].(bytesOffDiskStats); ok {
prevBytesReadItr = itr.BytesRead()
}
}
rv.iterators[i] = pl.Iterator(includeFreq, includeNorm, includeTermVectors, rv.iterators[i])

if _, ok := segment.segment.(bytesOffDiskStats); ok {
if postings, ok := pl.(bytesOffDiskStats); ok &&
prevBytesReadPL < postings.BytesRead() {
atomic.AddUint64(&is.parent.stats.TotBytesReadQueryTime,
postings.BytesRead()-prevBytesReadPL)
}

if itr, ok := rv.iterators[i].(bytesOffDiskStats); ok &&
prevBytesReadItr < itr.BytesRead() {
atomic.AddUint64(&is.parent.stats.TotBytesReadQueryTime,
itr.BytesRead()-prevBytesReadItr)
}
}
}
atomic.AddUint64(&i.parent.stats.TotTermSearchersStarted, uint64(1))
atomic.AddUint64(&is.parent.stats.TotTermSearchersStarted, uint64(1))
return rv, nil
}
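
The same before/after accounting repeats throughout this file, so here it is isolated as a sketch. This helper does not exist in the PR (the diff inlines the pattern at each call site), and in the postings-list and iterator cases the "before" value is read from the recycled object while the "after" value comes from the object returned by the call, but the idea is identical:

```go
// Illustrative only, assuming the scorch package's existing imports
// (sync/atomic and the segment API). component is anything that may
// implement BytesOffDiskStats (segment, postings list, iterator, docvalue
// reader); op is the call that may read from disk, e.g. Dictionary(field).
func addBytesReadDelta(stat *uint64, component interface{}, op func()) {
	tracked, ok := component.(segment.BytesOffDiskStats)
	var prev uint64
	if ok {
		prev = tracked.BytesRead()
	}
	op()
	if ok {
		atomic.AddUint64(stat, tracked.BytesRead()-prev)
	}
}
```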

Expand Down Expand Up @@ -661,10 +710,17 @@ func (i *IndexSnapshot) documentVisitFieldTermsOnSegment(
}

if ssvOk && ssv != nil && len(vFields) > 0 {
var prevBytesRead uint64
if ssvp, ok := ssv.(segment.BytesOffDiskStats); ok {
prevBytesRead = ssvp.BytesRead()
}
dvs, err = ssv.VisitDocValues(localDocNum, fields, visitor, dvs)
if err != nil {
return nil, nil, err
}
if ssvp, ok := ssv.(segment.BytesOffDiskStats); ok {
atomic.AddUint64(&i.parent.stats.TotBytesReadQueryTime, ssvp.BytesRead()-prevBytesRead)
}
}

if errCh != nil {
13 changes: 13 additions & 0 deletions index/scorch/snapshot_index_tfr.go
@@ -76,6 +76,10 @@ func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*in
}
// find the next hit
for i.segmentOffset < len(i.iterators) {
prevBytesRead := uint64(0)
if itr, ok := i.iterators[i.segmentOffset].(segment.BytesOffDiskStats); ok {
prevBytesRead = itr.BytesRead()
}
next, err := i.iterators[i.segmentOffset].Next()
if err != nil {
return nil, err
@@ -89,6 +93,15 @@ func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*in

i.currID = rv.ID
i.currPosting = next
// the postings iterators maintain the bytesRead stat cumulatively,
// because a single Next() can trigger a series of loadChunk calls whose
// bytes must be summed before the delta is reported upstream here.
if itr, ok := i.iterators[i.segmentOffset].(segment.BytesOffDiskStats); ok {
delta := itr.BytesRead() - prevBytesRead
atomic.AddUint64(&i.snapshot.parent.stats.TotBytesReadQueryTime, uint64(delta))
}

return rv, nil
}
i.segmentOffset++
2 changes: 2 additions & 0 deletions index/scorch/stats.go
@@ -42,7 +42,9 @@ type Stats struct {
TotAnalysisTime uint64
TotIndexTime uint64

TotBytesReadQueryTime uint64
TotIndexedPlainTextBytes uint64
TotIndexedAnalysisBytes uint64

TotTermSearchersStarted uint64
TotTermSearchersFinished uint64