Skip to content

Commit

Permalink
statistics: relax the check of the OutOfRange (#24958)
Browse files Browse the repository at this point in the history
  • Loading branch information
winoros authored Jun 2, 2021
1 parent da8dbe6 commit 32cf14b
Show file tree
Hide file tree
Showing 7 changed files with 113 additions and 70 deletions.
8 changes: 8 additions & 0 deletions statistics/cmsketch.go
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,14 @@ func (c *TopN) Num() int {
return len(c.TopN)
}

// outOfRange reports whether val lies outside the closed interval bounded by
// the first and the last TopN entries, compared byte-wise on their encoded form.
// A nil or empty TopN covers nothing, so every value is reported as out of range.
// NOTE(review): this presumes c.TopN is kept sorted by Encoded, so that the
// first/last entries are the lowest/highest ones — confirm against the builder.
func (c *TopN) outOfRange(val []byte) bool {
	if c == nil || len(c.TopN) == 0 {
		return true
	}
	lowest, highest := c.TopN[0].Encoded, c.TopN[c.Num()-1].Encoded
	belowLowest := bytes.Compare(val, lowest) < 0
	aboveHighest := bytes.Compare(val, highest) > 0
	return belowLowest || aboveHighest
}

// DecodedString returns the value with decoded result.
func (c *TopN) DecodedString(ctx sessionctx.Context, colTypes []byte) (string, error) {
builder := &strings.Builder{}
Expand Down
92 changes: 48 additions & 44 deletions statistics/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -506,20 +506,12 @@ func (hg *Histogram) BetweenRowCount(a, b types.Datum) float64 {
}

// BetweenRowCount estimates the row count for interval [l, r).
func (c *Column) BetweenRowCount(sc *stmtctx.StatementContext, l, r types.Datum) (float64, error) {
func (c *Column) BetweenRowCount(sc *stmtctx.StatementContext, l, r types.Datum, lowEncoded, highEncoded []byte) float64 {
histBetweenCnt := c.Histogram.BetweenRowCount(l, r)
if c.StatsVer <= Version1 {
return histBetweenCnt, nil
}
lBytes, err := codec.EncodeKey(sc, nil, l)
if err != nil {
return 0, errors.Trace(err)
}
rBytes, err := codec.EncodeKey(sc, nil, r)
if err != nil {
return 0, errors.Trace(err)
return histBetweenCnt
}
return float64(c.TopN.BetweenCount(lBytes, rBytes)) + histBetweenCnt, nil
return float64(c.TopN.BetweenCount(lowEncoded, highEncoded)) + histBetweenCnt
}

// TotalRowCount returns the total count of this histogram.
Expand Down Expand Up @@ -978,7 +970,7 @@ func (c *Column) IsInvalid(sc *stmtctx.StatementContext, collPseudo bool) bool {
return c.TotalRowCount() == 0 || (c.Histogram.NDV > 0 && c.notNullCount() == 0)
}

func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, modifyCount int64) (float64, error) {
func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, encodedVal []byte, modifyCount int64) (float64, error) {
if val.IsNull() {
return float64(c.NullCount), nil
}
Expand All @@ -987,7 +979,7 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, mo
if c.Histogram.Bounds.NumRows() == 0 {
return 0.0, nil
}
if c.Histogram.NDV > 0 && c.outOfRange(val) {
if c.Histogram.NDV > 0 && c.outOfRange(val, encodedVal) {
return outOfRangeEQSelectivity(c.Histogram.NDV, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil
}
if c.CMSketch != nil {
Expand All @@ -996,14 +988,17 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, mo
}
return c.Histogram.equalRowCount(val, false), nil
}
// All the values are null.
if c.Histogram.Bounds.NumRows() == 0 && c.TopN.Num() == 0 {
return 0, nil
}
if c.Histogram.NDV+int64(c.TopN.Num()) > 0 && c.outOfRange(val, encodedVal) {
return outOfRangeEQSelectivity(c.Histogram.NDV, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil
}
// Stats version == 2
// 1. try to find this value in TopN
if c.TopN != nil {
valBytes, err := codec.EncodeKey(sc, nil, val)
if err != nil {
return 0, errors.Trace(err)
}
rowcount, ok := c.QueryTopN(valBytes)
rowcount, ok := c.QueryTopN(encodedVal)
if ok {
return float64(rowcount), nil
}
Expand Down Expand Up @@ -1054,6 +1049,14 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
if err != nil {
return 0, errors.Trace(err)
}
lowEncoded, err := codec.EncodeKey(sc, nil, lowVal)
if err != nil {
return 0, err
}
highEncoded, err := codec.EncodeKey(sc, nil, highVal)
if err != nil {
return 0, err
}
if cmp == 0 {
// the point case.
if !rg.LowExclude && !rg.HighExclude {
Expand All @@ -1063,7 +1066,7 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
continue
}
var cnt float64
cnt, err = c.equalRowCount(sc, lowVal, modifyCount)
cnt, err = c.equalRowCount(sc, lowVal, lowEncoded, modifyCount)
if err != nil {
return 0, errors.Trace(err)
}
Expand All @@ -1075,7 +1078,7 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
// The small range case.
if rangeVals != nil {
for _, val := range rangeVals {
cnt, err := c.equalRowCount(sc, val, modifyCount)
cnt, err := c.equalRowCount(sc, val, lowEncoded, modifyCount)
if err != nil {
return 0, err
}
Expand All @@ -1084,18 +1087,15 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
continue
}
// The interval case.
cnt, err := c.BetweenRowCount(sc, lowVal, highVal)
if err != nil {
return 0, err
}
if (c.outOfRange(lowVal) && !lowVal.IsNull()) || c.outOfRange(highVal) {
cnt := c.BetweenRowCount(sc, lowVal, highVal, lowEncoded, highEncoded)
if (c.outOfRange(lowVal, lowEncoded) && !lowVal.IsNull()) || c.outOfRange(highVal, highEncoded) {
cnt += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount()
}
// `betweenRowCount` returns count for [l, h) range, we adjust cnt for boundaries here.
// Note that, `cnt` does not include null values, we need to specially handle cases
// where null is the lower bound.
if rg.LowExclude && !lowVal.IsNull() {
lowCnt, err := c.equalRowCount(sc, lowVal, modifyCount)
lowCnt, err := c.equalRowCount(sc, lowVal, lowEncoded, modifyCount)
if err != nil {
return 0, errors.Trace(err)
}
Expand All @@ -1105,7 +1105,7 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
cnt += float64(c.NullCount)
}
if !rg.HighExclude {
highCnt, err := c.equalRowCount(sc, highVal, modifyCount)
highCnt, err := c.equalRowCount(sc, highVal, highEncoded, modifyCount)
if err != nil {
return 0, errors.Trace(err)
}
Expand All @@ -1121,6 +1121,15 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
return rowCount, nil
}

// outOfRange reports whether the value is out of range for both the column's
// histogram (checked with the datum val) and its TopN (checked with the encoded
// form encodedVal). A value covered by either structure is not out of range.
func (c *Column) outOfRange(val types.Datum, encodedVal []byte) bool {
	// The histogram check runs first; the TopN is only consulted when the
	// histogram already considers the value out of range.
	return c.Histogram.outOfRange(val) && c.TopN.outOfRange(encodedVal)
}

// Index represents an index histogram.
type Index struct {
Histogram
Expand Down Expand Up @@ -1504,26 +1513,21 @@ func (coll *HistColl) NewHistCollBySelectivity(sc *stmtctx.StatementContext, sta
}

func (idx *Index) outOfRange(val types.Datum) bool {
histEmpty, topNEmpty := idx.Histogram.Len() == 0, idx.TopN.Num() == 0
// All empty.
if histEmpty && topNEmpty {
return true
}
// TopN is not empty. Record found.
if !topNEmpty && idx.TopN.findTopN(val.GetBytes()) >= 0 {
outOfTopN := idx.TopN.outOfRange(val.GetBytes())
// The val falls within the TopN's range, return false.
if !outOfTopN {
return false
}
if !histEmpty {
withInLowBoundOrPrefixMatch := chunk.Compare(idx.Bounds.GetRow(0), 0, &val) <= 0 ||
matchPrefix(idx.Bounds.GetRow(0), 0, &val)
withInHighBound := chunk.Compare(idx.Bounds.GetRow(idx.Bounds.NumRows()-1), 0, &val) >= 0
// Hist is not empty. Record found.
if withInLowBoundOrPrefixMatch && withInHighBound {
return false
}

histEmpty := idx.Histogram.Len() == 0
// HistEmpty->Hist out of range.
if histEmpty {
return true
}
// No record found. Is out of range.
return true
withInLowBoundOrPrefixMatch := chunk.Compare(idx.Bounds.GetRow(0), 0, &val) <= 0 ||
matchPrefix(idx.Bounds.GetRow(0), 0, &val)
withInHighBound := chunk.Compare(idx.Bounds.GetRow(idx.Bounds.NumRows()-1), 0, &val) >= 0
return !withInLowBoundOrPrefixMatch || !withInHighBound
}

// matchPrefix checks whether ad is the prefix of value
Expand Down
8 changes: 4 additions & 4 deletions statistics/selectivity_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -649,19 +649,19 @@ func (s *testStatsSuite) TestTopNOutOfHist(c *C) {

testKit.MustExec("drop table if exists topn_before_hist")
testKit.MustExec("create table topn_before_hist(a int, index idx(a))")
testKit.MustExec("insert into topn_before_hist values(1), (1), (1), (1), (2), (2), (3), (4), (5)")
testKit.MustExec("insert into topn_before_hist values(1), (1), (1), (1), (3), (3), (4), (5), (6)")
testKit.MustExec("analyze table topn_before_hist with 2 topn, 3 buckets")

testKit.MustExec("create table topn_after_hist(a int, index idx(a))")
testKit.MustExec("insert into topn_after_hist values(2), (2), (3), (4), (5), (6), (6), (6), (6)")
testKit.MustExec("insert into topn_after_hist values(2), (2), (3), (4), (5), (7), (7), (7), (7)")
testKit.MustExec("analyze table topn_after_hist with 2 topn, 3 buckets")

testKit.MustExec("create table topn_before_hist_no_index(a int)")
testKit.MustExec("insert into topn_before_hist_no_index values(1), (1), (1), (1), (2), (2), (3), (4), (5)")
testKit.MustExec("insert into topn_before_hist_no_index values(1), (1), (1), (1), (3), (3), (4), (5), (6)")
testKit.MustExec("analyze table topn_before_hist_no_index with 2 topn, 3 buckets")

testKit.MustExec("create table topn_after_hist_no_index(a int)")
testKit.MustExec("insert into topn_after_hist_no_index values(2), (2), (3), (4), (5), (6), (6), (6), (6)")
testKit.MustExec("insert into topn_after_hist_no_index values(2), (2), (3), (4), (5), (7), (7), (7), (7)")
testKit.MustExec("analyze table topn_after_hist_no_index with 2 topn, 3 buckets")

var (
Expand Down
2 changes: 1 addition & 1 deletion statistics/statistics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -458,7 +458,7 @@ func (s *testStatisticsSuite) TestPseudoTable(c *C) {
count, err := tbl.ColumnEqualRowCount(sc, types.NewIntDatum(1000), colInfo.ID)
c.Assert(err, IsNil)
c.Assert(int(count), Equals, 10)
count = tbl.ColumnBetweenRowCount(sc, types.NewIntDatum(1000), types.NewIntDatum(5000), colInfo.ID)
count, _ = tbl.ColumnBetweenRowCount(sc, types.NewIntDatum(1000), types.NewIntDatum(5000), colInfo.ID)
c.Assert(int(count), Equals, 250)
}

Expand Down
21 changes: 15 additions & 6 deletions statistics/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -277,19 +277,24 @@ func (t *Table) ColumnLessRowCount(sc *stmtctx.StatementContext, value types.Dat
}

// ColumnBetweenRowCount estimates the row count where the column is greater than or equal to a and less than b.
func (t *Table) ColumnBetweenRowCount(sc *stmtctx.StatementContext, a, b types.Datum, colID int64) float64 {
func (t *Table) ColumnBetweenRowCount(sc *stmtctx.StatementContext, a, b types.Datum, colID int64) (float64, error) {
c, ok := t.Columns[colID]
if !ok || c.IsInvalid(sc, t.Pseudo) {
return float64(t.Count) / pseudoBetweenRate
return float64(t.Count) / pseudoBetweenRate, nil
}
count, err := c.BetweenRowCount(sc, a, b)
aEncoded, err := codec.EncodeKey(sc, nil, a)
if err != nil {
return 0
return 0, err
}
bEncoded, err := codec.EncodeKey(sc, nil, b)
if err != nil {
return 0, err
}
count := c.BetweenRowCount(sc, a, b, aEncoded, bEncoded)
if a.IsNull() {
count += float64(c.NullCount)
}
return count * c.GetIncreaseFactor(t.Count)
return count * c.GetIncreaseFactor(t.Count), nil
}

// ColumnEqualRowCount estimates the row count where the column equals to value.
Expand All @@ -298,7 +303,11 @@ func (t *Table) ColumnEqualRowCount(sc *stmtctx.StatementContext, value types.Da
if !ok || c.IsInvalid(sc, t.Pseudo) {
return float64(t.Count) / pseudoEqualRate, nil
}
result, err := c.equalRowCount(sc, value, t.ModifyCount)
encodedVal, err := codec.EncodeKey(sc, nil, value)
if err != nil {
return 0, err
}
result, err := c.equalRowCount(sc, value, encodedVal, t.ModifyCount)
result *= c.GetIncreaseFactor(t.Count)
return result, errors.Trace(err)
}
Expand Down
6 changes: 5 additions & 1 deletion statistics/testdata/stats_suite_in.json
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,13 @@
"show stats_topn",
"show stats_buckets",
"explain select * from topn_before_hist where a = 1",
"explain select * from topn_before_hist where a = 2",
"explain select * from topn_after_hist where a = 7",
"explain select * from topn_after_hist where a = 6",
"explain select * from topn_after_hist_no_index where a = 7",
"explain select * from topn_after_hist_no_index where a = 6",
"explain select * from topn_before_hist_no_index where a = 1"
"explain select * from topn_before_hist_no_index where a = 1",
"explain select * from topn_before_hist_no_index where a = 2"
]
},
{
Expand Down
46 changes: 32 additions & 14 deletions statistics/testdata/stats_suite_out.json
Original file line number Diff line number Diff line change
Expand Up @@ -427,49 +427,67 @@
"Cases": [
[
"test topn_before_hist a 0 1 4",
"test topn_before_hist a 0 2 2",
"test topn_before_hist a 0 3 2",
"test topn_before_hist idx 1 1 4",
"test topn_before_hist idx 1 2 2",
"test topn_before_hist idx 1 3 2",
"test topn_after_hist a 0 2 2",
"test topn_after_hist a 0 6 4",
"test topn_after_hist a 0 7 4",
"test topn_after_hist idx 1 2 2",
"test topn_after_hist idx 1 6 4",
"test topn_after_hist idx 1 7 4",
"test topn_before_hist_no_index a 0 1 4",
"test topn_before_hist_no_index a 0 2 2",
"test topn_before_hist_no_index a 0 3 2",
"test topn_after_hist_no_index a 0 2 2",
"test topn_after_hist_no_index a 0 6 4"
"test topn_after_hist_no_index a 0 7 4"
],
[
"test topn_before_hist a 0 0 2 1 3 4 0",
"test topn_before_hist a 0 1 3 1 5 5 0",
"test topn_before_hist idx 1 0 2 1 3 4 0",
"test topn_before_hist idx 1 1 3 1 5 5 0",
"test topn_before_hist a 0 0 2 1 4 5 0",
"test topn_before_hist a 0 1 3 1 6 6 0",
"test topn_before_hist idx 1 0 2 1 4 5 0",
"test topn_before_hist idx 1 1 3 1 6 6 0",
"test topn_after_hist a 0 0 2 1 3 4 0",
"test topn_after_hist a 0 1 3 1 5 5 0",
"test topn_after_hist idx 1 0 2 1 3 4 0",
"test topn_after_hist idx 1 1 3 1 5 5 0",
"test topn_before_hist_no_index a 0 0 2 1 3 4 0",
"test topn_before_hist_no_index a 0 1 3 1 5 5 0",
"test topn_before_hist_no_index a 0 0 2 1 4 5 0",
"test topn_before_hist_no_index a 0 1 3 1 6 6 0",
"test topn_after_hist_no_index a 0 0 2 1 3 4 0",
"test topn_after_hist_no_index a 0 1 3 1 5 5 0"
],
[
"IndexReader_6 4.00 root index:IndexRangeScan_5",
"└─IndexRangeScan_5 4.00 cop[tikv] table:topn_before_hist, index:idx(a) range:[1,1], keep order:false"
],
[
"IndexReader_6 0.00 root index:IndexRangeScan_5",
"└─IndexRangeScan_5 0.00 cop[tikv] table:topn_before_hist, index:idx(a) range:[2,2], keep order:false"
],
[
"IndexReader_6 4.00 root index:IndexRangeScan_5",
"└─IndexRangeScan_5 4.00 cop[tikv] table:topn_after_hist, index:idx(a) range:[6,6], keep order:false"
"└─IndexRangeScan_5 4.00 cop[tikv] table:topn_after_hist, index:idx(a) range:[7,7], keep order:false"
],
[
"IndexReader_6 0.00 root index:IndexRangeScan_5",
"└─IndexRangeScan_5 0.00 cop[tikv] table:topn_after_hist, index:idx(a) range:[6,6], keep order:false"
],
[
"TableReader_7 4.00 root data:Selection_6",
"└─Selection_6 4.00 cop[tikv] eq(test.topn_after_hist_no_index.a, 6)",
"└─Selection_6 4.00 cop[tikv] eq(test.topn_after_hist_no_index.a, 7)",
" └─TableFullScan_5 9.00 cop[tikv] table:topn_after_hist_no_index keep order:false"
],
[
"TableReader_7 1.00 root data:Selection_6",
"└─Selection_6 1.00 cop[tikv] eq(test.topn_after_hist_no_index.a, 6)",
" └─TableFullScan_5 9.00 cop[tikv] table:topn_after_hist_no_index keep order:false"
],
[
"TableReader_7 4.00 root data:Selection_6",
"└─Selection_6 4.00 cop[tikv] eq(test.topn_before_hist_no_index.a, 1)",
" └─TableFullScan_5 9.00 cop[tikv] table:topn_before_hist_no_index keep order:false"
],
[
"TableReader_7 1.00 root data:Selection_6",
"└─Selection_6 1.00 cop[tikv] eq(test.topn_before_hist_no_index.a, 2)",
" └─TableFullScan_5 9.00 cop[tikv] table:topn_before_hist_no_index keep order:false"
]
]
},
Expand Down

0 comments on commit 32cf14b

Please sign in to comment.