Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

statistics: relax the check of the OutOfRange #24958

Merged
merged 3 commits into from
Jun 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions statistics/cmsketch.go
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,14 @@ func (c *TopN) Num() int {
return len(c.TopN)
}

// outOfRange reports whether val lies outside the closed interval spanned by
// the first and last TopN entries, comparing encoded bytes lexicographically.
// A nil receiver or an empty TopN is treated as out of range.
// NOTE(review): using the first/last entries as the lowest/highest bounds
// presumes c.TopN is kept sorted by Encoded — confirm against the builders.
func (c *TopN) outOfRange(val []byte) bool {
	if c == nil || len(c.TopN) == 0 {
		return true
	}
	lowest := c.TopN[0].Encoded
	highest := c.TopN[len(c.TopN)-1].Encoded
	// Out of range means strictly below the first entry or strictly above the last.
	return bytes.Compare(val, lowest) < 0 || bytes.Compare(val, highest) > 0
}

// DecodedString returns the value with decoded result.
func (c *TopN) DecodedString(ctx sessionctx.Context, colTypes []byte) (string, error) {
builder := &strings.Builder{}
Expand Down
92 changes: 48 additions & 44 deletions statistics/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -506,20 +506,12 @@ func (hg *Histogram) BetweenRowCount(a, b types.Datum) float64 {
}

// BetweenRowCount estimates the row count for interval [l, r).
func (c *Column) BetweenRowCount(sc *stmtctx.StatementContext, l, r types.Datum) (float64, error) {
func (c *Column) BetweenRowCount(sc *stmtctx.StatementContext, l, r types.Datum, lowEncoded, highEncoded []byte) float64 {
histBetweenCnt := c.Histogram.BetweenRowCount(l, r)
if c.StatsVer <= Version1 {
return histBetweenCnt, nil
}
lBytes, err := codec.EncodeKey(sc, nil, l)
if err != nil {
return 0, errors.Trace(err)
}
rBytes, err := codec.EncodeKey(sc, nil, r)
if err != nil {
return 0, errors.Trace(err)
return histBetweenCnt
}
return float64(c.TopN.BetweenCount(lBytes, rBytes)) + histBetweenCnt, nil
return float64(c.TopN.BetweenCount(lowEncoded, highEncoded)) + histBetweenCnt
}

// TotalRowCount returns the total count of this histogram.
Expand Down Expand Up @@ -978,7 +970,7 @@ func (c *Column) IsInvalid(sc *stmtctx.StatementContext, collPseudo bool) bool {
return c.TotalRowCount() == 0 || (c.Histogram.NDV > 0 && c.notNullCount() == 0)
}

func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, modifyCount int64) (float64, error) {
func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, encodedVal []byte, modifyCount int64) (float64, error) {
if val.IsNull() {
return float64(c.NullCount), nil
}
Expand All @@ -987,7 +979,7 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, mo
if c.Histogram.Bounds.NumRows() == 0 {
return 0.0, nil
}
if c.Histogram.NDV > 0 && c.outOfRange(val) {
if c.Histogram.NDV > 0 && c.outOfRange(val, encodedVal) {
return outOfRangeEQSelectivity(c.Histogram.NDV, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil
}
if c.CMSketch != nil {
Expand All @@ -996,14 +988,17 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, mo
}
return c.Histogram.equalRowCount(val, false), nil
}
// All the values are null.
if c.Histogram.Bounds.NumRows() == 0 && c.TopN.Num() == 0 {
return 0, nil
}
if c.Histogram.NDV+int64(c.TopN.Num()) > 0 && c.outOfRange(val, encodedVal) {
return outOfRangeEQSelectivity(c.Histogram.NDV, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil
}
// Stats version == 2
// 1. try to find this value in TopN
if c.TopN != nil {
valBytes, err := codec.EncodeKey(sc, nil, val)
if err != nil {
return 0, errors.Trace(err)
}
rowcount, ok := c.QueryTopN(valBytes)
rowcount, ok := c.QueryTopN(encodedVal)
if ok {
return float64(rowcount), nil
}
Expand Down Expand Up @@ -1054,6 +1049,14 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
if err != nil {
return 0, errors.Trace(err)
}
lowEncoded, err := codec.EncodeKey(sc, nil, lowVal)
if err != nil {
return 0, err
}
highEncoded, err := codec.EncodeKey(sc, nil, highVal)
if err != nil {
return 0, err
}
if cmp == 0 {
// the point case.
if !rg.LowExclude && !rg.HighExclude {
Expand All @@ -1063,7 +1066,7 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
continue
}
var cnt float64
cnt, err = c.equalRowCount(sc, lowVal, modifyCount)
cnt, err = c.equalRowCount(sc, lowVal, lowEncoded, modifyCount)
if err != nil {
return 0, errors.Trace(err)
}
Expand All @@ -1075,7 +1078,7 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
// The small range case.
if rangeVals != nil {
for _, val := range rangeVals {
cnt, err := c.equalRowCount(sc, val, modifyCount)
cnt, err := c.equalRowCount(sc, val, lowEncoded, modifyCount)
if err != nil {
return 0, err
}
Expand All @@ -1084,18 +1087,15 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
continue
}
// The interval case.
cnt, err := c.BetweenRowCount(sc, lowVal, highVal)
if err != nil {
return 0, err
}
if (c.outOfRange(lowVal) && !lowVal.IsNull()) || c.outOfRange(highVal) {
cnt := c.BetweenRowCount(sc, lowVal, highVal, lowEncoded, highEncoded)
if (c.outOfRange(lowVal, lowEncoded) && !lowVal.IsNull()) || c.outOfRange(highVal, highEncoded) {
cnt += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount()
}
// `betweenRowCount` returns count for [l, h) range, we adjust cnt for boundaries here.
// Note that, `cnt` does not include null values, we need specially handle cases
// where null is the lower bound.
if rg.LowExclude && !lowVal.IsNull() {
lowCnt, err := c.equalRowCount(sc, lowVal, modifyCount)
lowCnt, err := c.equalRowCount(sc, lowVal, lowEncoded, modifyCount)
if err != nil {
return 0, errors.Trace(err)
}
Expand All @@ -1105,7 +1105,7 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
cnt += float64(c.NullCount)
}
if !rg.HighExclude {
highCnt, err := c.equalRowCount(sc, highVal, modifyCount)
highCnt, err := c.equalRowCount(sc, highVal, highEncoded, modifyCount)
if err != nil {
return 0, errors.Trace(err)
}
Expand All @@ -1121,6 +1121,15 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
return rowCount, nil
}

// outOfRange reports whether the value is out of range for the column: it
// returns true only when the histogram check (on val) and the TopN check
// (on encodedVal) both say the value is out of range.
func (c *Column) outOfRange(val types.Datum, encodedVal []byte) bool {
	// The TopN check is only evaluated once the histogram has already
	// reported the value as out of range (short-circuit).
	return c.Histogram.outOfRange(val) && c.TopN.outOfRange(encodedVal)
}

// Index represents an index histogram.
type Index struct {
Histogram
Expand Down Expand Up @@ -1504,26 +1513,21 @@ func (coll *HistColl) NewHistCollBySelectivity(sc *stmtctx.StatementContext, sta
}

func (idx *Index) outOfRange(val types.Datum) bool {
histEmpty, topNEmpty := idx.Histogram.Len() == 0, idx.TopN.Num() == 0
// All empty.
if histEmpty && topNEmpty {
return true
}
// TopN is not empty. Record found.
if !topNEmpty && idx.TopN.findTopN(val.GetBytes()) >= 0 {
outOfTopN := idx.TopN.outOfRange(val.GetBytes())
// The val is in TopN, return false.
if !outOfTopN {
return false
}
if !histEmpty {
withInLowBoundOrPrefixMatch := chunk.Compare(idx.Bounds.GetRow(0), 0, &val) <= 0 ||
matchPrefix(idx.Bounds.GetRow(0), 0, &val)
withInHighBound := chunk.Compare(idx.Bounds.GetRow(idx.Bounds.NumRows()-1), 0, &val) >= 0
// Hist is not empty. Record found.
if withInLowBoundOrPrefixMatch && withInHighBound {
return false
}

histEmpty := idx.Histogram.Len() == 0
// HistEmpty->Hist out of range.
if histEmpty {
return true
}
// No record found. Is out of range.
return true
withInLowBoundOrPrefixMatch := chunk.Compare(idx.Bounds.GetRow(0), 0, &val) <= 0 ||
matchPrefix(idx.Bounds.GetRow(0), 0, &val)
withInHighBound := chunk.Compare(idx.Bounds.GetRow(idx.Bounds.NumRows()-1), 0, &val) >= 0
return !withInLowBoundOrPrefixMatch || !withInHighBound
}

// matchPrefix checks whether ad is the prefix of value
Expand Down
8 changes: 4 additions & 4 deletions statistics/selectivity_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -649,19 +649,19 @@ func (s *testStatsSuite) TestTopNOutOfHist(c *C) {

testKit.MustExec("drop table if exists topn_before_hist")
testKit.MustExec("create table topn_before_hist(a int, index idx(a))")
testKit.MustExec("insert into topn_before_hist values(1), (1), (1), (1), (2), (2), (3), (4), (5)")
testKit.MustExec("insert into topn_before_hist values(1), (1), (1), (1), (3), (3), (4), (5), (6)")
testKit.MustExec("analyze table topn_before_hist with 2 topn, 3 buckets")

testKit.MustExec("create table topn_after_hist(a int, index idx(a))")
testKit.MustExec("insert into topn_after_hist values(2), (2), (3), (4), (5), (6), (6), (6), (6)")
testKit.MustExec("insert into topn_after_hist values(2), (2), (3), (4), (5), (7), (7), (7), (7)")
testKit.MustExec("analyze table topn_after_hist with 2 topn, 3 buckets")

testKit.MustExec("create table topn_before_hist_no_index(a int)")
testKit.MustExec("insert into topn_before_hist_no_index values(1), (1), (1), (1), (2), (2), (3), (4), (5)")
testKit.MustExec("insert into topn_before_hist_no_index values(1), (1), (1), (1), (3), (3), (4), (5), (6)")
testKit.MustExec("analyze table topn_before_hist_no_index with 2 topn, 3 buckets")

testKit.MustExec("create table topn_after_hist_no_index(a int)")
testKit.MustExec("insert into topn_after_hist_no_index values(2), (2), (3), (4), (5), (6), (6), (6), (6)")
testKit.MustExec("insert into topn_after_hist_no_index values(2), (2), (3), (4), (5), (7), (7), (7), (7)")
testKit.MustExec("analyze table topn_after_hist_no_index with 2 topn, 3 buckets")

var (
Expand Down
2 changes: 1 addition & 1 deletion statistics/statistics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -458,7 +458,7 @@ func (s *testStatisticsSuite) TestPseudoTable(c *C) {
count, err := tbl.ColumnEqualRowCount(sc, types.NewIntDatum(1000), colInfo.ID)
c.Assert(err, IsNil)
c.Assert(int(count), Equals, 10)
count = tbl.ColumnBetweenRowCount(sc, types.NewIntDatum(1000), types.NewIntDatum(5000), colInfo.ID)
count, _ = tbl.ColumnBetweenRowCount(sc, types.NewIntDatum(1000), types.NewIntDatum(5000), colInfo.ID)
c.Assert(int(count), Equals, 250)
}

Expand Down
21 changes: 15 additions & 6 deletions statistics/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -277,19 +277,24 @@ func (t *Table) ColumnLessRowCount(sc *stmtctx.StatementContext, value types.Dat
}

// ColumnBetweenRowCount estimates the row count where column greater or equal to a and less than b.
func (t *Table) ColumnBetweenRowCount(sc *stmtctx.StatementContext, a, b types.Datum, colID int64) float64 {
func (t *Table) ColumnBetweenRowCount(sc *stmtctx.StatementContext, a, b types.Datum, colID int64) (float64, error) {
c, ok := t.Columns[colID]
if !ok || c.IsInvalid(sc, t.Pseudo) {
return float64(t.Count) / pseudoBetweenRate
return float64(t.Count) / pseudoBetweenRate, nil
}
count, err := c.BetweenRowCount(sc, a, b)
aEncoded, err := codec.EncodeKey(sc, nil, a)
if err != nil {
return 0
return 0, err
}
bEncoded, err := codec.EncodeKey(sc, nil, b)
if err != nil {
return 0, err
}
count := c.BetweenRowCount(sc, a, b, aEncoded, bEncoded)
if a.IsNull() {
count += float64(c.NullCount)
}
return count * c.GetIncreaseFactor(t.Count)
return count * c.GetIncreaseFactor(t.Count), nil
}

// ColumnEqualRowCount estimates the row count where the column equals to value.
Expand All @@ -298,7 +303,11 @@ func (t *Table) ColumnEqualRowCount(sc *stmtctx.StatementContext, value types.Da
if !ok || c.IsInvalid(sc, t.Pseudo) {
return float64(t.Count) / pseudoEqualRate, nil
}
result, err := c.equalRowCount(sc, value, t.ModifyCount)
encodedVal, err := codec.EncodeKey(sc, nil, value)
if err != nil {
return 0, err
}
result, err := c.equalRowCount(sc, value, encodedVal, t.ModifyCount)
result *= c.GetIncreaseFactor(t.Count)
return result, errors.Trace(err)
}
Expand Down
6 changes: 5 additions & 1 deletion statistics/testdata/stats_suite_in.json
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,13 @@
"show stats_topn",
"show stats_buckets",
"explain select * from topn_before_hist where a = 1",
"explain select * from topn_before_hist where a = 2",
"explain select * from topn_after_hist where a = 7",
"explain select * from topn_after_hist where a = 6",
"explain select * from topn_after_hist_no_index where a = 7",
"explain select * from topn_after_hist_no_index where a = 6",
"explain select * from topn_before_hist_no_index where a = 1"
"explain select * from topn_before_hist_no_index where a = 1",
"explain select * from topn_before_hist_no_index where a = 2"
]
},
{
Expand Down
46 changes: 32 additions & 14 deletions statistics/testdata/stats_suite_out.json
Original file line number Diff line number Diff line change
Expand Up @@ -427,49 +427,67 @@
"Cases": [
[
"test topn_before_hist a 0 1 4",
"test topn_before_hist a 0 2 2",
"test topn_before_hist a 0 3 2",
"test topn_before_hist idx 1 1 4",
"test topn_before_hist idx 1 2 2",
"test topn_before_hist idx 1 3 2",
"test topn_after_hist a 0 2 2",
"test topn_after_hist a 0 6 4",
"test topn_after_hist a 0 7 4",
"test topn_after_hist idx 1 2 2",
"test topn_after_hist idx 1 6 4",
"test topn_after_hist idx 1 7 4",
"test topn_before_hist_no_index a 0 1 4",
"test topn_before_hist_no_index a 0 2 2",
"test topn_before_hist_no_index a 0 3 2",
"test topn_after_hist_no_index a 0 2 2",
"test topn_after_hist_no_index a 0 6 4"
"test topn_after_hist_no_index a 0 7 4"
],
[
"test topn_before_hist a 0 0 2 1 3 4 0",
"test topn_before_hist a 0 1 3 1 5 5 0",
"test topn_before_hist idx 1 0 2 1 3 4 0",
"test topn_before_hist idx 1 1 3 1 5 5 0",
"test topn_before_hist a 0 0 2 1 4 5 0",
"test topn_before_hist a 0 1 3 1 6 6 0",
"test topn_before_hist idx 1 0 2 1 4 5 0",
"test topn_before_hist idx 1 1 3 1 6 6 0",
"test topn_after_hist a 0 0 2 1 3 4 0",
"test topn_after_hist a 0 1 3 1 5 5 0",
"test topn_after_hist idx 1 0 2 1 3 4 0",
"test topn_after_hist idx 1 1 3 1 5 5 0",
"test topn_before_hist_no_index a 0 0 2 1 3 4 0",
"test topn_before_hist_no_index a 0 1 3 1 5 5 0",
"test topn_before_hist_no_index a 0 0 2 1 4 5 0",
"test topn_before_hist_no_index a 0 1 3 1 6 6 0",
"test topn_after_hist_no_index a 0 0 2 1 3 4 0",
"test topn_after_hist_no_index a 0 1 3 1 5 5 0"
],
[
"IndexReader_6 4.00 root index:IndexRangeScan_5",
"└─IndexRangeScan_5 4.00 cop[tikv] table:topn_before_hist, index:idx(a) range:[1,1], keep order:false"
],
[
"IndexReader_6 0.00 root index:IndexRangeScan_5",
"└─IndexRangeScan_5 0.00 cop[tikv] table:topn_before_hist, index:idx(a) range:[2,2], keep order:false"
],
[
"IndexReader_6 4.00 root index:IndexRangeScan_5",
"└─IndexRangeScan_5 4.00 cop[tikv] table:topn_after_hist, index:idx(a) range:[6,6], keep order:false"
"└─IndexRangeScan_5 4.00 cop[tikv] table:topn_after_hist, index:idx(a) range:[7,7], keep order:false"
],
[
"IndexReader_6 0.00 root index:IndexRangeScan_5",
"└─IndexRangeScan_5 0.00 cop[tikv] table:topn_after_hist, index:idx(a) range:[6,6], keep order:false"
],
[
"TableReader_7 4.00 root data:Selection_6",
"└─Selection_6 4.00 cop[tikv] eq(test.topn_after_hist_no_index.a, 6)",
"└─Selection_6 4.00 cop[tikv] eq(test.topn_after_hist_no_index.a, 7)",
" └─TableFullScan_5 9.00 cop[tikv] table:topn_after_hist_no_index keep order:false"
],
[
"TableReader_7 1.00 root data:Selection_6",
"└─Selection_6 1.00 cop[tikv] eq(test.topn_after_hist_no_index.a, 6)",
" └─TableFullScan_5 9.00 cop[tikv] table:topn_after_hist_no_index keep order:false"
],
[
"TableReader_7 4.00 root data:Selection_6",
"└─Selection_6 4.00 cop[tikv] eq(test.topn_before_hist_no_index.a, 1)",
" └─TableFullScan_5 9.00 cop[tikv] table:topn_before_hist_no_index keep order:false"
],
[
"TableReader_7 1.00 root data:Selection_6",
"└─Selection_6 1.00 cop[tikv] eq(test.topn_before_hist_no_index.a, 2)",
" └─TableFullScan_5 9.00 cop[tikv] table:topn_before_hist_no_index keep order:false"
]
]
},
Expand Down