From 32cf14bd7dafdc23409fd0ee289c3742063f6478 Mon Sep 17 00:00:00 2001 From: Yiding Cui Date: Wed, 2 Jun 2021 22:16:25 +0800 Subject: [PATCH] statistics: relax the check of the OutOfRange (#24958) --- statistics/cmsketch.go | 8 +++ statistics/histogram.go | 92 ++++++++++++------------ statistics/selectivity_test.go | 8 +-- statistics/statistics_test.go | 2 +- statistics/table.go | 21 ++++-- statistics/testdata/stats_suite_in.json | 6 +- statistics/testdata/stats_suite_out.json | 46 ++++++++---- 7 files changed, 113 insertions(+), 70 deletions(-) diff --git a/statistics/cmsketch.go b/statistics/cmsketch.go index c510186b16c40..07d90434a6cc7 100644 --- a/statistics/cmsketch.go +++ b/statistics/cmsketch.go @@ -530,6 +530,14 @@ func (c *TopN) Num() int { return len(c.TopN) } +// outOfRange checks whether the the given value falls back in [TopN.LowestOne, TopN.HighestOne]. +func (c *TopN) outOfRange(val []byte) bool { + if c == nil || len(c.TopN) == 0 { + return true + } + return bytes.Compare(c.TopN[0].Encoded, val) > 0 || bytes.Compare(val, c.TopN[c.Num()-1].Encoded) > 0 +} + // DecodedString returns the value with decoded result. func (c *TopN) DecodedString(ctx sessionctx.Context, colTypes []byte) (string, error) { builder := &strings.Builder{} diff --git a/statistics/histogram.go b/statistics/histogram.go index 027950b8326c7..f377b14c0c067 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -506,20 +506,12 @@ func (hg *Histogram) BetweenRowCount(a, b types.Datum) float64 { } // BetweenRowCount estimates the row count for interval [l, r). -func (c *Column) BetweenRowCount(sc *stmtctx.StatementContext, l, r types.Datum) (float64, error) { +func (c *Column) BetweenRowCount(sc *stmtctx.StatementContext, l, r types.Datum, lowEncoded, highEncoded []byte) float64 { histBetweenCnt := c.Histogram.BetweenRowCount(l, r) if c.StatsVer <= Version1 { - return histBetweenCnt, nil - } - lBytes, err := codec.EncodeKey(sc, nil, l) - if err != nil { - return 0, errors.Trace(err) - } - rBytes, err := codec.EncodeKey(sc, nil, r) - if err != nil { - return 0, errors.Trace(err) + return histBetweenCnt } - return float64(c.TopN.BetweenCount(lBytes, rBytes)) + histBetweenCnt, nil + return float64(c.TopN.BetweenCount(lowEncoded, highEncoded)) + histBetweenCnt } // TotalRowCount returns the total count of this histogram. @@ -978,7 +970,7 @@ func (c *Column) IsInvalid(sc *stmtctx.StatementContext, collPseudo bool) bool { return c.TotalRowCount() == 0 || (c.Histogram.NDV > 0 && c.notNullCount() == 0) } -func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, modifyCount int64) (float64, error) { +func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, encodedVal []byte, modifyCount int64) (float64, error) { if val.IsNull() { return float64(c.NullCount), nil } @@ -987,7 +979,7 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, mo if c.Histogram.Bounds.NumRows() == 0 { return 0.0, nil } - if c.Histogram.NDV > 0 && c.outOfRange(val) { + if c.Histogram.NDV > 0 && c.outOfRange(val, encodedVal) { return outOfRangeEQSelectivity(c.Histogram.NDV, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil } if c.CMSketch != nil { @@ -996,14 +988,17 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, mo } return c.Histogram.equalRowCount(val, false), nil } + // All the values are null. + if c.Histogram.Bounds.NumRows() == 0 && c.TopN.Num() == 0 { + return 0, nil + } + if c.Histogram.NDV+int64(c.TopN.Num()) > 0 && c.outOfRange(val, encodedVal) { + return outOfRangeEQSelectivity(c.Histogram.NDV, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil + } // Stats version == 2 // 1. try to find this value in TopN if c.TopN != nil { - valBytes, err := codec.EncodeKey(sc, nil, val) - if err != nil { - return 0, errors.Trace(err) - } - rowcount, ok := c.QueryTopN(valBytes) + rowcount, ok := c.QueryTopN(encodedVal) if ok { return float64(rowcount), nil } @@ -1054,6 +1049,14 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range if err != nil { return 0, errors.Trace(err) } + lowEncoded, err := codec.EncodeKey(sc, nil, lowVal) + if err != nil { + return 0, err + } + highEncoded, err := codec.EncodeKey(sc, nil, highVal) + if err != nil { + return 0, err + } if cmp == 0 { // the point case. if !rg.LowExclude && !rg.HighExclude { @@ -1063,7 +1066,7 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range continue } var cnt float64 - cnt, err = c.equalRowCount(sc, lowVal, modifyCount) + cnt, err = c.equalRowCount(sc, lowVal, lowEncoded, modifyCount) if err != nil { return 0, errors.Trace(err) } @@ -1075,7 +1078,7 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range // The small range case. if rangeVals != nil { for _, val := range rangeVals { - cnt, err := c.equalRowCount(sc, val, modifyCount) + cnt, err := c.equalRowCount(sc, val, lowEncoded, modifyCount) if err != nil { return 0, err } @@ -1084,18 +1087,15 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range continue } // The interval case. - cnt, err := c.BetweenRowCount(sc, lowVal, highVal) - if err != nil { - return 0, err - } - if (c.outOfRange(lowVal) && !lowVal.IsNull()) || c.outOfRange(highVal) { + cnt := c.BetweenRowCount(sc, lowVal, highVal, lowEncoded, highEncoded) + if (c.outOfRange(lowVal, lowEncoded) && !lowVal.IsNull()) || c.outOfRange(highVal, highEncoded) { cnt += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount() } // `betweenRowCount` returns count for [l, h) range, we adjust cnt for boudaries here. // Note that, `cnt` does not include null values, we need specially handle cases // where null is the lower bound. if rg.LowExclude && !lowVal.IsNull() { - lowCnt, err := c.equalRowCount(sc, lowVal, modifyCount) + lowCnt, err := c.equalRowCount(sc, lowVal, lowEncoded, modifyCount) if err != nil { return 0, errors.Trace(err) } @@ -1105,7 +1105,7 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range cnt += float64(c.NullCount) } if !rg.HighExclude { - highCnt, err := c.equalRowCount(sc, highVal, modifyCount) + highCnt, err := c.equalRowCount(sc, highVal, highEncoded, modifyCount) if err != nil { return 0, errors.Trace(err) } @@ -1121,6 +1121,15 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range return rowCount, nil } +func (c *Column) outOfRange(val types.Datum, encodedVal []byte) bool { + outOfHist := c.Histogram.outOfRange(val) + if !outOfHist { + return false + } + // Already out of hist. + return c.TopN.outOfRange(encodedVal) +} + // Index represents an index histogram. type Index struct { Histogram @@ -1504,26 +1513,21 @@ func (coll *HistColl) NewHistCollBySelectivity(sc *stmtctx.StatementContext, sta } func (idx *Index) outOfRange(val types.Datum) bool { - histEmpty, topNEmpty := idx.Histogram.Len() == 0, idx.TopN.Num() == 0 - // All empty. - if histEmpty && topNEmpty { - return true - } - // TopN is not empty. Record found. - if !topNEmpty && idx.TopN.findTopN(val.GetBytes()) >= 0 { + outOfTopN := idx.TopN.outOfRange(val.GetBytes()) + // The val is in TopN, return false. + if !outOfTopN { return false } - if !histEmpty { - withInLowBoundOrPrefixMatch := chunk.Compare(idx.Bounds.GetRow(0), 0, &val) <= 0 || - matchPrefix(idx.Bounds.GetRow(0), 0, &val) - withInHighBound := chunk.Compare(idx.Bounds.GetRow(idx.Bounds.NumRows()-1), 0, &val) >= 0 - // Hist is not empty. Record found. - if withInLowBoundOrPrefixMatch && withInHighBound { - return false - } + + histEmpty := idx.Histogram.Len() == 0 + // HistEmpty->Hist out of range. + if histEmpty { + return true } - // No record found. Is out of range. - return true + withInLowBoundOrPrefixMatch := chunk.Compare(idx.Bounds.GetRow(0), 0, &val) <= 0 || + matchPrefix(idx.Bounds.GetRow(0), 0, &val) + withInHighBound := chunk.Compare(idx.Bounds.GetRow(idx.Bounds.NumRows()-1), 0, &val) >= 0 + return !withInLowBoundOrPrefixMatch || !withInHighBound } // matchPrefix checks whether ad is the prefix of value diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go index 359e1d2db9585..ac12be22442b9 100644 --- a/statistics/selectivity_test.go +++ b/statistics/selectivity_test.go @@ -649,19 +649,19 @@ func (s *testStatsSuite) TestTopNOutOfHist(c *C) { testKit.MustExec("drop table if exists topn_before_hist") testKit.MustExec("create table topn_before_hist(a int, index idx(a))") - testKit.MustExec("insert into topn_before_hist values(1), (1), (1), (1), (2), (2), (3), (4), (5)") + testKit.MustExec("insert into topn_before_hist values(1), (1), (1), (1), (3), (3), (4), (5), (6)") testKit.MustExec("analyze table topn_before_hist with 2 topn, 3 buckets") testKit.MustExec("create table topn_after_hist(a int, index idx(a))") - testKit.MustExec("insert into topn_after_hist values(2), (2), (3), (4), (5), (6), (6), (6), (6)") + testKit.MustExec("insert into topn_after_hist values(2), (2), (3), (4), (5), (7), (7), (7), (7)") testKit.MustExec("analyze table topn_after_hist with 2 topn, 3 buckets") testKit.MustExec("create table topn_before_hist_no_index(a int)") - testKit.MustExec("insert into topn_before_hist_no_index values(1), (1), (1), (1), (2), (2), (3), (4), (5)") + testKit.MustExec("insert into topn_before_hist_no_index values(1), (1), (1), (1), (3), (3), (4), (5), (6)") testKit.MustExec("analyze table topn_before_hist_no_index with 2 topn, 3 buckets") testKit.MustExec("create table topn_after_hist_no_index(a int)") - testKit.MustExec("insert into topn_after_hist_no_index values(2), (2), (3), (4), (5), (6), (6), (6), (6)") + testKit.MustExec("insert into topn_after_hist_no_index values(2), (2), (3), (4), (5), (7), (7), (7), (7)") testKit.MustExec("analyze table topn_after_hist_no_index with 2 topn, 3 buckets") var ( diff --git a/statistics/statistics_test.go b/statistics/statistics_test.go index 7fd0bf64b0bf5..c352c3576c89b 100644 --- a/statistics/statistics_test.go +++ b/statistics/statistics_test.go @@ -458,7 +458,7 @@ func (s *testStatisticsSuite) TestPseudoTable(c *C) { count, err := tbl.ColumnEqualRowCount(sc, types.NewIntDatum(1000), colInfo.ID) c.Assert(err, IsNil) c.Assert(int(count), Equals, 10) - count = tbl.ColumnBetweenRowCount(sc, types.NewIntDatum(1000), types.NewIntDatum(5000), colInfo.ID) + count, _ = tbl.ColumnBetweenRowCount(sc, types.NewIntDatum(1000), types.NewIntDatum(5000), colInfo.ID) c.Assert(int(count), Equals, 250) } diff --git a/statistics/table.go b/statistics/table.go index 7628e018e25a5..85807fbefc67f 100644 --- a/statistics/table.go +++ b/statistics/table.go @@ -277,19 +277,24 @@ func (t *Table) ColumnLessRowCount(sc *stmtctx.StatementContext, value types.Dat } // ColumnBetweenRowCount estimates the row count where column greater or equal to a and less than b. -func (t *Table) ColumnBetweenRowCount(sc *stmtctx.StatementContext, a, b types.Datum, colID int64) float64 { +func (t *Table) ColumnBetweenRowCount(sc *stmtctx.StatementContext, a, b types.Datum, colID int64) (float64, error) { c, ok := t.Columns[colID] if !ok || c.IsInvalid(sc, t.Pseudo) { - return float64(t.Count) / pseudoBetweenRate + return float64(t.Count) / pseudoBetweenRate, nil } - count, err := c.BetweenRowCount(sc, a, b) + aEncoded, err := codec.EncodeKey(sc, nil, a) if err != nil { - return 0 + return 0, err } + bEncoded, err := codec.EncodeKey(sc, nil, b) + if err != nil { + return 0, err + } + count := c.BetweenRowCount(sc, a, b, aEncoded, bEncoded) if a.IsNull() { count += float64(c.NullCount) } - return count * c.GetIncreaseFactor(t.Count) + return count * c.GetIncreaseFactor(t.Count), nil } // ColumnEqualRowCount estimates the row count where the column equals to value. @@ -298,7 +303,11 @@ func (t *Table) ColumnEqualRowCount(sc *stmtctx.StatementContext, value types.Da if !ok || c.IsInvalid(sc, t.Pseudo) { return float64(t.Count) / pseudoEqualRate, nil } - result, err := c.equalRowCount(sc, value, t.ModifyCount) + encodedVal, err := codec.EncodeKey(sc, nil, value) + if err != nil { + return 0, err + } + result, err := c.equalRowCount(sc, value, encodedVal, t.ModifyCount) result *= c.GetIncreaseFactor(t.Count) return result, errors.Trace(err) } diff --git a/statistics/testdata/stats_suite_in.json b/statistics/testdata/stats_suite_in.json index 631e2aa6c60e2..b20b6d8300433 100644 --- a/statistics/testdata/stats_suite_in.json +++ b/statistics/testdata/stats_suite_in.json @@ -72,9 +72,13 @@ "show stats_topn", "show stats_buckets", "explain select * from topn_before_hist where a = 1", + "explain select * from topn_before_hist where a = 2", + "explain select * from topn_after_hist where a = 7", "explain select * from topn_after_hist where a = 6", + "explain select * from topn_after_hist_no_index where a = 7", "explain select * from topn_after_hist_no_index where a = 6", - "explain select * from topn_before_hist_no_index where a = 1" + "explain select * from topn_before_hist_no_index where a = 1", + "explain select * from topn_before_hist_no_index where a = 2" ] }, { diff --git a/statistics/testdata/stats_suite_out.json b/statistics/testdata/stats_suite_out.json index c25f082455c2f..b60a351b6cead 100644 --- a/statistics/testdata/stats_suite_out.json +++ b/statistics/testdata/stats_suite_out.json @@ -427,29 +427,29 @@ "Cases": [ [ "test topn_before_hist a 0 1 4", - "test topn_before_hist a 0 2 2", + "test topn_before_hist a 0 3 2", "test topn_before_hist idx 1 1 4", - "test topn_before_hist idx 1 2 2", + "test topn_before_hist idx 1 3 2", "test topn_after_hist a 0 2 2", - "test topn_after_hist a 0 6 4", + "test topn_after_hist a 0 7 4", "test topn_after_hist idx 1 2 2", - "test topn_after_hist idx 1 6 4", + "test topn_after_hist idx 1 7 4", "test topn_before_hist_no_index a 0 1 4", - "test topn_before_hist_no_index a 0 2 2", + "test topn_before_hist_no_index a 0 3 2", "test topn_after_hist_no_index a 0 2 2", - "test topn_after_hist_no_index a 0 6 4" + "test topn_after_hist_no_index a 0 7 4" ], [ - "test topn_before_hist a 0 0 2 1 3 4 0", - "test topn_before_hist a 0 1 3 1 5 5 0", - "test topn_before_hist idx 1 0 2 1 3 4 0", - "test topn_before_hist idx 1 1 3 1 5 5 0", + "test topn_before_hist a 0 0 2 1 4 5 0", + "test topn_before_hist a 0 1 3 1 6 6 0", + "test topn_before_hist idx 1 0 2 1 4 5 0", + "test topn_before_hist idx 1 1 3 1 6 6 0", "test topn_after_hist a 0 0 2 1 3 4 0", "test topn_after_hist a 0 1 3 1 5 5 0", "test topn_after_hist idx 1 0 2 1 3 4 0", "test topn_after_hist idx 1 1 3 1 5 5 0", - "test topn_before_hist_no_index a 0 0 2 1 3 4 0", - "test topn_before_hist_no_index a 0 1 3 1 5 5 0", + "test topn_before_hist_no_index a 0 0 2 1 4 5 0", + "test topn_before_hist_no_index a 0 1 3 1 6 6 0", "test topn_after_hist_no_index a 0 0 2 1 3 4 0", "test topn_after_hist_no_index a 0 1 3 1 5 5 0" ], @@ -457,19 +457,37 @@ "IndexReader_6 4.00 root index:IndexRangeScan_5", "└─IndexRangeScan_5 4.00 cop[tikv] table:topn_before_hist, index:idx(a) range:[1,1], keep order:false" ], + [ + "IndexReader_6 0.00 root index:IndexRangeScan_5", + "└─IndexRangeScan_5 0.00 cop[tikv] table:topn_before_hist, index:idx(a) range:[2,2], keep order:false" + ], [ "IndexReader_6 4.00 root index:IndexRangeScan_5", - "└─IndexRangeScan_5 4.00 cop[tikv] table:topn_after_hist, index:idx(a) range:[6,6], keep order:false" + "└─IndexRangeScan_5 4.00 cop[tikv] table:topn_after_hist, index:idx(a) range:[7,7], keep order:false" + ], + [ + "IndexReader_6 0.00 root index:IndexRangeScan_5", + "└─IndexRangeScan_5 0.00 cop[tikv] table:topn_after_hist, index:idx(a) range:[6,6], keep order:false" ], [ "TableReader_7 4.00 root data:Selection_6", - "└─Selection_6 4.00 cop[tikv] eq(test.topn_after_hist_no_index.a, 6)", + "└─Selection_6 4.00 cop[tikv] eq(test.topn_after_hist_no_index.a, 7)", + " └─TableFullScan_5 9.00 cop[tikv] table:topn_after_hist_no_index keep order:false" + ], + [ + "TableReader_7 1.00 root data:Selection_6", + "└─Selection_6 1.00 cop[tikv] eq(test.topn_after_hist_no_index.a, 6)", " └─TableFullScan_5 9.00 cop[tikv] table:topn_after_hist_no_index keep order:false" ], [ "TableReader_7 4.00 root data:Selection_6", "└─Selection_6 4.00 cop[tikv] eq(test.topn_before_hist_no_index.a, 1)", " └─TableFullScan_5 9.00 cop[tikv] table:topn_before_hist_no_index keep order:false" + ], + [ + "TableReader_7 1.00 root data:Selection_6", + "└─Selection_6 1.00 cop[tikv] eq(test.topn_before_hist_no_index.a, 2)", + " └─TableFullScan_5 9.00 cop[tikv] table:topn_before_hist_no_index keep order:false" ] ] },