Skip to content

Commit

Permalink
statistics: use histogram buckets bounds to enhance string match func…
Browse files Browse the repository at this point in the history
…tions estimation (#40338)

ref #36209
  • Loading branch information
time-and-fate authored Feb 28, 2023
1 parent 1da60a3 commit 96e345d
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 97 deletions.
4 changes: 2 additions & 2 deletions statistics/selectivity.go
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ OUTER:
// Try to cover remaining string matching functions by evaluating the expressions with TopN to estimate.
if ctx.GetSessionVars().EnableEvalTopNEstimationForStrMatch() {
for i, scalarCond := range notCoveredStrMatch {
ok, sel, err := coll.GetSelectivityByFilter(ctx, ctx.GetSessionVars().GetStrMatchDefaultSelectivity(), []expression.Expression{scalarCond})
ok, sel, err := coll.GetSelectivityByFilter(ctx, []expression.Expression{scalarCond})
if err != nil {
sc.AppendWarning(errors.New("Error when using TopN-assisted estimation: " + err.Error()))
}
Expand All @@ -454,7 +454,7 @@ OUTER:
delete(notCoveredStrMatch, i)
}
for i, scalarCond := range notCoveredNegateStrMatch {
ok, sel, err := coll.GetSelectivityByFilter(ctx, ctx.GetSessionVars().GetNegateStrMatchDefaultSelectivity(), []expression.Expression{scalarCond})
ok, sel, err := coll.GetSelectivityByFilter(ctx, []expression.Expression{scalarCond})
if err != nil {
sc.AppendWarning(errors.New("Error when using TopN-assisted estimation: " + err.Error()))
}
Expand Down
62 changes: 45 additions & 17 deletions statistics/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -678,7 +678,7 @@ func CETraceRange(sctx sessionctx.Context, tableID int64, colNames []string, ran

func (coll *HistColl) findAvailableStatsForCol(sctx sessionctx.Context, uniqueID int64) (isIndex bool, idx int64) {
// try to find available stats in column stats
if colStats, ok := coll.Columns[uniqueID]; ok && colStats != nil && !colStats.IsInvalid(sctx, coll.Pseudo) {
if colStats, ok := coll.Columns[uniqueID]; ok && colStats != nil && !colStats.IsInvalid(sctx, coll.Pseudo) && colStats.IsFullLoad() {
return false, uniqueID
}
// try to find available stats in single column index stats (except for prefix index)
Expand All @@ -687,20 +687,18 @@ func (coll *HistColl) findAvailableStatsForCol(sctx sessionctx.Context, uniqueID
idxStats, ok := coll.Indices[idxStatsIdx]
if ok &&
idxStats.Info.Columns[0].Length == types.UnspecifiedLength &&
!idxStats.IsInvalid(coll.Pseudo) {
!idxStats.IsInvalid(coll.Pseudo) &&
idxStats.IsFullLoad() {
return true, idxStatsIdx
}
}
}
return false, -1
}

// GetSelectivityByFilter try to estimate selectivity of expressions by evaluate the expressions using TopN and NULL.
// The data represented by the Histogram would use the defaultSelectivity parameter as the selectivity.
// GetSelectivityByFilter tries to estimate the selectivity of expressions by evaluating them against the TopN values, the Histogram bucket boundaries and NULL.
// Currently, this method can only handle expressions involving a single column.
func (coll *HistColl) GetSelectivityByFilter(sctx sessionctx.Context,
defaultSelectivity float64,
filters []expression.Expression) (ok bool, selectivity float64, err error) {
func (coll *HistColl) GetSelectivityByFilter(sctx sessionctx.Context, filters []expression.Expression) (ok bool, selectivity float64, err error) {
// 1. Make sure the expressions
// (1) are safe to be evaluated here,
// (2) involve only one column,
Expand Down Expand Up @@ -766,12 +764,13 @@ func (coll *HistColl) GetSelectivityByFilter(sctx sessionctx.Context,
// Restore the original Index to avoid unexpected situation.
col.Index = originalIndex
}()
size := 1
topNLen := 0
histBucketsLen := hist.Len()
if topn != nil {
size = len(topn.TopN)
topNLen = len(topn.TopN)
}
c := chunk.NewChunkWithCapacity([]*types.FieldType{tp}, size)
selected := make([]bool, 0, size)
c := chunk.NewChunkWithCapacity([]*types.FieldType{tp}, mathutil.Max(1, topNLen))
selected := make([]bool, 0, mathutil.Max(histBucketsLen, topNLen))

// 3. Calculate the TopN part selectivity.
// This stage is considered as the core functionality of this method, errors in this stage would make this entire method fail.
Expand All @@ -797,20 +796,49 @@ func (coll *HistColl) GetSelectivityByFilter(sctx sessionctx.Context,
topNSel = float64(topNSelectedCnt) / totalCnt

// 4. Calculate the Histogram part selectivity.
histSel = defaultSelectivity * histTotalCnt / totalCnt
// The bucket upper bounds, weighted by Bucket.Repeat, are used like the TopN above.
// The bucket lower bounds are used as random samples, each given equal weight.
if hist != nil && histTotalCnt > 0 {
selected = selected[:0]
selected, err = expression.VectorizedFilter(sctx, filters, chunk.NewIterator4Chunk(hist.Bounds), selected)
if err != nil {
return false, 0, err
}
var bucketRepeatTotalCnt, bucketRepeatSelectedCnt, lowerBoundMatchCnt int64
for i := range hist.Buckets {
bucketRepeatTotalCnt += hist.Buckets[i].Repeat
if len(selected) < 2*i {
// This should not happen, but we add this check for safety.
break
}
if selected[2*i] {
lowerBoundMatchCnt++
}
if selected[2*i+1] {
bucketRepeatSelectedCnt += hist.Buckets[i].Repeat
}
}
var lowerBoundsRatio, upperBoundsRatio, lowerBoundsSel, upperBoundsSel float64
upperBoundsRatio = mathutil.Min(float64(bucketRepeatTotalCnt)/histTotalCnt, 1)
lowerBoundsRatio = 1 - upperBoundsRatio
if bucketRepeatTotalCnt > 0 {
upperBoundsSel = float64(bucketRepeatSelectedCnt) / float64(bucketRepeatTotalCnt)
}
lowerBoundsSel = float64(lowerBoundMatchCnt) / float64(histBucketsLen)
histSel = lowerBoundsSel*lowerBoundsRatio + upperBoundsSel*upperBoundsRatio
histSel *= histTotalCnt / totalCnt
}

// 5. Calculate the NULL part selectivity.
// Errors in this stage would be returned, but would not make this entire method fail.
c.Reset()
c.AppendNull(0)
selected = selected[:0]
selected, err = expression.VectorizedFilter(sctx, filters, chunk.NewIterator4Chunk(c), selected)
if err != nil || len(selected) != 1 {
nullSel = defaultSelectivity * float64(nullCnt) / totalCnt
} else if selected[0] {
nullSel = float64(nullCnt) / totalCnt
} else {
if err != nil || len(selected) != 1 || !selected[0] {
nullSel = 0
} else {
nullSel = float64(nullCnt) / totalCnt
}

// 6. Get the final result.
Expand Down
Loading

0 comments on commit 96e345d

Please sign in to comment.