Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

stats: do not split excluded lower value ranges (#12009) #12172

Merged
merged 2 commits into from
Sep 13, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 13 additions & 7 deletions statistics/feedback.go
Original file line number Diff line number Diff line change
Expand Up @@ -309,15 +309,21 @@ func buildBucketFeedback(h *Histogram, feedback *QueryFeedback) (map[int]*Bucket
if skip {
continue
}
idx, _ := h.Bounds.LowerBound(0, fb.lower)
idx := h.Bounds.UpperBound(0, fb.lower)
bktIdx := 0
// The last bucket also stores the feedback that falls outside the upper bound.
if idx >= h.Bounds.NumRows()-2 {
if idx >= h.Bounds.NumRows()-1 {
bktIdx = h.Len() - 1
} else if h.Len() == 1 {
bktIdx = 0
} else {
bktIdx = idx / 2
if idx == 0 {
bktIdx = 0
} else {
bktIdx = (idx - 1) / 2
}
// Make sure that this feedback lies within the bucket.
if chunk.Compare(h.Bounds.GetRow(2*bktIdx+1), 0, fb.upper) < 0 {
if chunk.Compare(h.Bounds.GetRow(2*(bktIdx+1)), 0, fb.upper) < 0 {
continue
}
}
Expand Down Expand Up @@ -954,11 +960,11 @@ func formatBuckets(hg *Histogram, lowBkt, highBkt, idxCols int) string {
return hg.bucketToString(lowBkt, idxCols)
}
if lowBkt+1 == highBkt {
return fmt.Sprintf("%s, %s", hg.bucketToString(lowBkt, 0), hg.bucketToString(highBkt, 0))
return fmt.Sprintf("%s, %s", hg.bucketToString(lowBkt, idxCols), hg.bucketToString(highBkt, idxCols))
}
// We do not care about the middle buckets.
return fmt.Sprintf("%s, (%d buckets, total count %d), %s", hg.bucketToString(lowBkt, 0),
highBkt-lowBkt-1, hg.Buckets[highBkt-1].Count-hg.Buckets[lowBkt].Count, hg.bucketToString(highBkt, 0))
return fmt.Sprintf("%s, (%d buckets, total count %d), %s", hg.bucketToString(lowBkt, idxCols),
highBkt-lowBkt-1, hg.Buckets[highBkt-1].Count-hg.Buckets[lowBkt].Count, hg.bucketToString(highBkt, idxCols))
}

func colRangeToStr(c *Column, ran *ranger.Range, actual int64, factor float64) string {
Expand Down
15 changes: 7 additions & 8 deletions statistics/feedback_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,13 @@ func (s *testFeedbackSuite) TestUpdateHistogram(c *C) {
defaultBucketCount = 7
defer func() { defaultBucketCount = originBucketCount }()
c.Assert(UpdateHistogram(q.Hist(), q).ToString(0), Equals,
"column:0 ndv:10058 totColSize:0\n"+
"num: 10000 lower_bound: 0 upper_bound: 1 repeats: 0\n"+
"num: 9 lower_bound: 2 upper_bound: 7 repeats: 0\n"+
"num: 11 lower_bound: 8 upper_bound: 19 repeats: 0\n"+
"num: 0 lower_bound: 20 upper_bound: 20 repeats: 0\n"+
"num: 18 lower_bound: 21 upper_bound: 39 repeats: 0\n"+
"num: 18 lower_bound: 40 upper_bound: 58 repeats: 0\n"+
"num: 2 lower_bound: 59 upper_bound: 60 repeats: 0")
"column:0 ndv:10053 totColSize:0\n"+
"num: 10001 lower_bound: 0 upper_bound: 2 repeats: 0\n"+
"num: 7 lower_bound: 2 upper_bound: 5 repeats: 0\n"+
"num: 4 lower_bound: 5 upper_bound: 7 repeats: 0\n"+
"num: 11 lower_bound: 10 upper_bound: 20 repeats: 0\n"+
"num: 19 lower_bound: 30 upper_bound: 49 repeats: 0\n"+
"num: 11 lower_bound: 50 upper_bound: 60 repeats: 0")
}

func (s *testFeedbackSuite) TestSplitBuckets(c *C) {
Expand Down
57 changes: 31 additions & 26 deletions statistics/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -559,41 +559,43 @@ func (hg *Histogram) typeMatch(ranges []*ranger.Range) bool {
return true
}

// SplitRange splits the range according to the histogram upper bound. Note that we treat last bucket's upper bound
// as inf, so all the split ranges will totally fall in one of the (-inf, u(0)], (u(0), u(1)],...(u(n-3), u(n-2)],
// (u(n-2), +inf), where n is the number of buckets, u(i) is the i-th bucket's upper bound.
// SplitRange splits the range according to the histogram's lower bounds. Note that we treat the first bucket's lower bound
// as -inf and the last bucket's upper bound as +inf, so every split range falls entirely within one of (-inf, l(1)),
// [l(1), l(2)), ..., [l(n-2), l(n-1)), [l(n-1), +inf), where n is the number of buckets and l(i) is the i-th bucket's lower bound.
func (hg *Histogram) SplitRange(sc *stmtctx.StatementContext, oldRanges []*ranger.Range, encoded bool) ([]*ranger.Range, bool) {
if !hg.typeMatch(oldRanges) {
return oldRanges, false
}
// Treat the only bucket as (-inf, +inf), so we do not need to split it.
if hg.Len() == 1 {
return oldRanges, true
}
ranges := make([]*ranger.Range, 0, len(oldRanges))
for _, ran := range oldRanges {
ranges = append(ranges, ran.Clone())
}
split := make([]*ranger.Range, 0, len(ranges))
for len(ranges) > 0 {
// Find the last bound that greater or equal to the LowVal.
// Find the first bound that is greater than the LowVal.
idx := hg.Bounds.UpperBound(0, &ranges[0].LowVal[0])
if !ranges[0].LowExclude && idx > 0 {
cmp := chunk.Compare(hg.Bounds.GetRow(idx-1), 0, &ranges[0].LowVal[0])
if cmp == 0 {
idx--
}
}
// Treat last bucket's upper bound as inf, so we do not need split any more.
if idx >= hg.Bounds.NumRows()-2 {
// Treat the last bucket's upper bound as +inf, so we do not need to split any more.
if idx >= hg.Bounds.NumRows()-1 {
split = append(split, ranges...)
break
}
// Get the corresponding upper bound.
if idx%2 == 0 {
// Treat the first bucket's lower bound as -inf, so just advance to the next lower bound.
if idx == 0 {
idx = 2
}
// Get the next lower bound.
if idx%2 == 1 {
idx++
}
upperBound := hg.Bounds.GetRow(idx)
lowerBound := hg.Bounds.GetRow(idx)
var i int
// Find the first range that need to be split by the upper bound.
// Find the first range that needs to be split by the lower bound.
for ; i < len(ranges); i++ {
if chunk.Compare(upperBound, 0, &ranges[i].HighVal[0]) < 0 {
if chunk.Compare(lowerBound, 0, &ranges[i].HighVal[0]) <= 0 {
break
}
}
Expand All @@ -602,17 +604,20 @@ func (hg *Histogram) SplitRange(sc *stmtctx.StatementContext, oldRanges []*range
if len(ranges) == 0 {
break
}
// Split according to the upper bound.
cmp := chunk.Compare(upperBound, 0, &ranges[0].LowVal[0])
if cmp > 0 || (cmp == 0 && !ranges[0].LowExclude) {
upper := upperBound.GetDatum(0, hg.tp)
split = append(split, &ranger.Range{
// Split according to the lower bound.
cmp := chunk.Compare(lowerBound, 0, &ranges[0].LowVal[0])
if cmp > 0 {
lower := lowerBound.GetDatum(0, hg.tp)
newRange := &ranger.Range{
LowExclude: ranges[0].LowExclude,
LowVal: []types.Datum{ranges[0].LowVal[0]},
HighVal: []types.Datum{upper},
HighExclude: false})
ranges[0].LowVal[0] = upper
ranges[0].LowExclude = true
HighVal: []types.Datum{lower},
HighExclude: true}
if validRange(sc, newRange, encoded) {
split = append(split, newRange)
}
ranges[0].LowVal[0] = lower
ranges[0].LowExclude = false
if !validRange(sc, ranges[0], encoded) {
ranges = ranges[1:]
}
Expand Down
Loading