From 32cf14bd7dafdc23409fd0ee289c3742063f6478 Mon Sep 17 00:00:00 2001
From: Yiding Cui <winoros@gmail.com>
Date: Wed, 2 Jun 2021 22:16:25 +0800
Subject: [PATCH] statistics: relax the check of the OutOfRange (#24958)

---
 statistics/cmsketch.go                   |  8 +++
 statistics/histogram.go                  | 92 ++++++++++++------------
 statistics/selectivity_test.go           |  8 +--
 statistics/statistics_test.go            |  2 +-
 statistics/table.go                      | 21 ++++--
 statistics/testdata/stats_suite_in.json  |  6 +-
 statistics/testdata/stats_suite_out.json | 46 ++++++++----
 7 files changed, 113 insertions(+), 70 deletions(-)

diff --git a/statistics/cmsketch.go b/statistics/cmsketch.go
index c510186b16c40..07d90434a6cc7 100644
--- a/statistics/cmsketch.go
+++ b/statistics/cmsketch.go
@@ -530,6 +530,14 @@ func (c *TopN) Num() int {
 	return len(c.TopN)
 }
 
+// outOfRange checks whether the given value falls outside the range [TopN.LowestOne, TopN.HighestOne].
+func (c *TopN) outOfRange(val []byte) bool {
+	if c == nil || len(c.TopN) == 0 {
+		return true
+	}
+	return bytes.Compare(c.TopN[0].Encoded, val) > 0 || bytes.Compare(val, c.TopN[c.Num()-1].Encoded) > 0
+}
+
 // DecodedString returns the value with decoded result.
 func (c *TopN) DecodedString(ctx sessionctx.Context, colTypes []byte) (string, error) {
 	builder := &strings.Builder{}
diff --git a/statistics/histogram.go b/statistics/histogram.go
index 027950b8326c7..f377b14c0c067 100644
--- a/statistics/histogram.go
+++ b/statistics/histogram.go
@@ -506,20 +506,12 @@ func (hg *Histogram) BetweenRowCount(a, b types.Datum) float64 {
 }
 
 // BetweenRowCount estimates the row count for interval [l, r).
-func (c *Column) BetweenRowCount(sc *stmtctx.StatementContext, l, r types.Datum) (float64, error) {
+func (c *Column) BetweenRowCount(sc *stmtctx.StatementContext, l, r types.Datum, lowEncoded, highEncoded []byte) float64 {
 	histBetweenCnt := c.Histogram.BetweenRowCount(l, r)
 	if c.StatsVer <= Version1 {
-		return histBetweenCnt, nil
-	}
-	lBytes, err := codec.EncodeKey(sc, nil, l)
-	if err != nil {
-		return 0, errors.Trace(err)
-	}
-	rBytes, err := codec.EncodeKey(sc, nil, r)
-	if err != nil {
-		return 0, errors.Trace(err)
+		return histBetweenCnt
 	}
-	return float64(c.TopN.BetweenCount(lBytes, rBytes)) + histBetweenCnt, nil
+	return float64(c.TopN.BetweenCount(lowEncoded, highEncoded)) + histBetweenCnt
 }
 
 // TotalRowCount returns the total count of this histogram.
@@ -978,7 +970,7 @@ func (c *Column) IsInvalid(sc *stmtctx.StatementContext, collPseudo bool) bool {
 	return c.TotalRowCount() == 0 || (c.Histogram.NDV > 0 && c.notNullCount() == 0)
 }
 
-func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, modifyCount int64) (float64, error) {
+func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, encodedVal []byte, modifyCount int64) (float64, error) {
 	if val.IsNull() {
 		return float64(c.NullCount), nil
 	}
@@ -987,7 +979,7 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, mo
 		if c.Histogram.Bounds.NumRows() == 0 {
 			return 0.0, nil
 		}
-		if c.Histogram.NDV > 0 && c.outOfRange(val) {
+		if c.Histogram.NDV > 0 && c.outOfRange(val, encodedVal) {
 			return outOfRangeEQSelectivity(c.Histogram.NDV, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil
 		}
 		if c.CMSketch != nil {
@@ -996,14 +988,17 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, mo
 		}
 		return c.Histogram.equalRowCount(val, false), nil
 	}
+	// All the values are null.
+	if c.Histogram.Bounds.NumRows() == 0 && c.TopN.Num() == 0 {
+		return 0, nil
+	}
+	if c.Histogram.NDV+int64(c.TopN.Num()) > 0 && c.outOfRange(val, encodedVal) {
+		return outOfRangeEQSelectivity(c.Histogram.NDV, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil
+	}
 	// Stats version == 2
 	// 1. try to find this value in TopN
 	if c.TopN != nil {
-		valBytes, err := codec.EncodeKey(sc, nil, val)
-		if err != nil {
-			return 0, errors.Trace(err)
-		}
-		rowcount, ok := c.QueryTopN(valBytes)
+		rowcount, ok := c.QueryTopN(encodedVal)
 		if ok {
 			return float64(rowcount), nil
 		}
@@ -1054,6 +1049,14 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
 		if err != nil {
 			return 0, errors.Trace(err)
 		}
+		lowEncoded, err := codec.EncodeKey(sc, nil, lowVal)
+		if err != nil {
+			return 0, err
+		}
+		highEncoded, err := codec.EncodeKey(sc, nil, highVal)
+		if err != nil {
+			return 0, err
+		}
 		if cmp == 0 {
 			// the point case.
 			if !rg.LowExclude && !rg.HighExclude {
@@ -1063,7 +1066,7 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
 					continue
 				}
 				var cnt float64
-				cnt, err = c.equalRowCount(sc, lowVal, modifyCount)
+				cnt, err = c.equalRowCount(sc, lowVal, lowEncoded, modifyCount)
 				if err != nil {
 					return 0, errors.Trace(err)
 				}
@@ -1075,7 +1078,7 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
 		// The small range case.
 		if rangeVals != nil {
 			for _, val := range rangeVals {
-				cnt, err := c.equalRowCount(sc, val, modifyCount)
+				cnt, err := c.equalRowCount(sc, val, lowEncoded, modifyCount)
 				if err != nil {
 					return 0, err
 				}
@@ -1084,18 +1087,15 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
 			continue
 		}
 		// The interval case.
-		cnt, err := c.BetweenRowCount(sc, lowVal, highVal)
-		if err != nil {
-			return 0, err
-		}
-		if (c.outOfRange(lowVal) && !lowVal.IsNull()) || c.outOfRange(highVal) {
+		cnt := c.BetweenRowCount(sc, lowVal, highVal, lowEncoded, highEncoded)
+		if (c.outOfRange(lowVal, lowEncoded) && !lowVal.IsNull()) || c.outOfRange(highVal, highEncoded) {
 			cnt += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount()
 		}
 		// `betweenRowCount` returns count for [l, h) range, we adjust cnt for boudaries here.
 		// Note that, `cnt` does not include null values, we need specially handle cases
 		// where null is the lower bound.
 		if rg.LowExclude && !lowVal.IsNull() {
-			lowCnt, err := c.equalRowCount(sc, lowVal, modifyCount)
+			lowCnt, err := c.equalRowCount(sc, lowVal, lowEncoded, modifyCount)
 			if err != nil {
 				return 0, errors.Trace(err)
 			}
@@ -1105,7 +1105,7 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
 			cnt += float64(c.NullCount)
 		}
 		if !rg.HighExclude {
-			highCnt, err := c.equalRowCount(sc, highVal, modifyCount)
+			highCnt, err := c.equalRowCount(sc, highVal, highEncoded, modifyCount)
 			if err != nil {
 				return 0, errors.Trace(err)
 			}
@@ -1121,6 +1121,15 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
 	return rowCount, nil
 }
 
+func (c *Column) outOfRange(val types.Datum, encodedVal []byte) bool {
+	outOfHist := c.Histogram.outOfRange(val)
+	if !outOfHist {
+		return false
+	}
+	// Already out of hist.
+	return c.TopN.outOfRange(encodedVal)
+}
+
 // Index represents an index histogram.
 type Index struct {
 	Histogram
@@ -1504,26 +1513,21 @@ func (coll *HistColl) NewHistCollBySelectivity(sc *stmtctx.StatementContext, sta
 }
 
 func (idx *Index) outOfRange(val types.Datum) bool {
-	histEmpty, topNEmpty := idx.Histogram.Len() == 0, idx.TopN.Num() == 0
-	// All empty.
-	if histEmpty && topNEmpty {
-		return true
-	}
-	// TopN is not empty. Record found.
-	if !topNEmpty && idx.TopN.findTopN(val.GetBytes()) >= 0 {
+	outOfTopN := idx.TopN.outOfRange(val.GetBytes())
+	// The val is within the TopN's [lowest, highest] range, so it is not out of range.
+	if !outOfTopN {
 		return false
 	}
-	if !histEmpty {
-		withInLowBoundOrPrefixMatch := chunk.Compare(idx.Bounds.GetRow(0), 0, &val) <= 0 ||
-			matchPrefix(idx.Bounds.GetRow(0), 0, &val)
-		withInHighBound := chunk.Compare(idx.Bounds.GetRow(idx.Bounds.NumRows()-1), 0, &val) >= 0
-		// Hist is not empty. Record found.
-		if withInLowBoundOrPrefixMatch && withInHighBound {
-			return false
-		}
+
+	histEmpty := idx.Histogram.Len() == 0
+	// An empty histogram cannot cover the value, so it is out of range.
+	if histEmpty {
+		return true
 	}
-	// No record found. Is out of range.
-	return true
+	withInLowBoundOrPrefixMatch := chunk.Compare(idx.Bounds.GetRow(0), 0, &val) <= 0 ||
+		matchPrefix(idx.Bounds.GetRow(0), 0, &val)
+	withInHighBound := chunk.Compare(idx.Bounds.GetRow(idx.Bounds.NumRows()-1), 0, &val) >= 0
+	return !withInLowBoundOrPrefixMatch || !withInHighBound
 }
 
 // matchPrefix checks whether ad is the prefix of value
diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go
index 359e1d2db9585..ac12be22442b9 100644
--- a/statistics/selectivity_test.go
+++ b/statistics/selectivity_test.go
@@ -649,19 +649,19 @@ func (s *testStatsSuite) TestTopNOutOfHist(c *C) {
 
 	testKit.MustExec("drop table if exists topn_before_hist")
 	testKit.MustExec("create table topn_before_hist(a int, index idx(a))")
-	testKit.MustExec("insert into topn_before_hist values(1), (1), (1), (1), (2), (2), (3), (4), (5)")
+	testKit.MustExec("insert into topn_before_hist values(1), (1), (1), (1), (3), (3), (4), (5), (6)")
 	testKit.MustExec("analyze table topn_before_hist with 2 topn, 3 buckets")
 
 	testKit.MustExec("create table topn_after_hist(a int, index idx(a))")
-	testKit.MustExec("insert into topn_after_hist values(2), (2), (3), (4), (5), (6), (6), (6), (6)")
+	testKit.MustExec("insert into topn_after_hist values(2), (2), (3), (4), (5), (7), (7), (7), (7)")
 	testKit.MustExec("analyze table topn_after_hist with 2 topn, 3 buckets")
 
 	testKit.MustExec("create table topn_before_hist_no_index(a int)")
-	testKit.MustExec("insert into topn_before_hist_no_index values(1), (1), (1), (1), (2), (2), (3), (4), (5)")
+	testKit.MustExec("insert into topn_before_hist_no_index values(1), (1), (1), (1), (3), (3), (4), (5), (6)")
 	testKit.MustExec("analyze table topn_before_hist_no_index with 2 topn, 3 buckets")
 
 	testKit.MustExec("create table topn_after_hist_no_index(a int)")
-	testKit.MustExec("insert into topn_after_hist_no_index values(2), (2), (3), (4), (5), (6), (6), (6), (6)")
+	testKit.MustExec("insert into topn_after_hist_no_index values(2), (2), (3), (4), (5), (7), (7), (7), (7)")
 	testKit.MustExec("analyze table topn_after_hist_no_index with 2 topn, 3 buckets")
 
 	var (
diff --git a/statistics/statistics_test.go b/statistics/statistics_test.go
index 7fd0bf64b0bf5..c352c3576c89b 100644
--- a/statistics/statistics_test.go
+++ b/statistics/statistics_test.go
@@ -458,7 +458,7 @@ func (s *testStatisticsSuite) TestPseudoTable(c *C) {
 	count, err := tbl.ColumnEqualRowCount(sc, types.NewIntDatum(1000), colInfo.ID)
 	c.Assert(err, IsNil)
 	c.Assert(int(count), Equals, 10)
-	count = tbl.ColumnBetweenRowCount(sc, types.NewIntDatum(1000), types.NewIntDatum(5000), colInfo.ID)
+	count, _ = tbl.ColumnBetweenRowCount(sc, types.NewIntDatum(1000), types.NewIntDatum(5000), colInfo.ID)
 	c.Assert(int(count), Equals, 250)
 }
 
diff --git a/statistics/table.go b/statistics/table.go
index 7628e018e25a5..85807fbefc67f 100644
--- a/statistics/table.go
+++ b/statistics/table.go
@@ -277,19 +277,24 @@ func (t *Table) ColumnLessRowCount(sc *stmtctx.StatementContext, value types.Dat
 }
 
 // ColumnBetweenRowCount estimates the row count where column greater or equal to a and less than b.
-func (t *Table) ColumnBetweenRowCount(sc *stmtctx.StatementContext, a, b types.Datum, colID int64) float64 {
+func (t *Table) ColumnBetweenRowCount(sc *stmtctx.StatementContext, a, b types.Datum, colID int64) (float64, error) {
 	c, ok := t.Columns[colID]
 	if !ok || c.IsInvalid(sc, t.Pseudo) {
-		return float64(t.Count) / pseudoBetweenRate
+		return float64(t.Count) / pseudoBetweenRate, nil
 	}
-	count, err := c.BetweenRowCount(sc, a, b)
+	aEncoded, err := codec.EncodeKey(sc, nil, a)
 	if err != nil {
-		return 0
+		return 0, err
 	}
+	bEncoded, err := codec.EncodeKey(sc, nil, b)
+	if err != nil {
+		return 0, err
+	}
+	count := c.BetweenRowCount(sc, a, b, aEncoded, bEncoded)
 	if a.IsNull() {
 		count += float64(c.NullCount)
 	}
-	return count * c.GetIncreaseFactor(t.Count)
+	return count * c.GetIncreaseFactor(t.Count), nil
 }
 
 // ColumnEqualRowCount estimates the row count where the column equals to value.
@@ -298,7 +303,11 @@ func (t *Table) ColumnEqualRowCount(sc *stmtctx.StatementContext, value types.Da
 	if !ok || c.IsInvalid(sc, t.Pseudo) {
 		return float64(t.Count) / pseudoEqualRate, nil
 	}
-	result, err := c.equalRowCount(sc, value, t.ModifyCount)
+	encodedVal, err := codec.EncodeKey(sc, nil, value)
+	if err != nil {
+		return 0, err
+	}
+	result, err := c.equalRowCount(sc, value, encodedVal, t.ModifyCount)
 	result *= c.GetIncreaseFactor(t.Count)
 	return result, errors.Trace(err)
 }
diff --git a/statistics/testdata/stats_suite_in.json b/statistics/testdata/stats_suite_in.json
index 631e2aa6c60e2..b20b6d8300433 100644
--- a/statistics/testdata/stats_suite_in.json
+++ b/statistics/testdata/stats_suite_in.json
@@ -72,9 +72,13 @@
       "show stats_topn",
       "show stats_buckets",
       "explain select * from topn_before_hist where a = 1",
+      "explain select * from topn_before_hist where a = 2",
+      "explain select * from topn_after_hist where a = 7",
       "explain select * from topn_after_hist where a = 6",
+      "explain select * from topn_after_hist_no_index where a = 7",
       "explain select * from topn_after_hist_no_index where a = 6",
-      "explain select * from topn_before_hist_no_index where a = 1"
+      "explain select * from topn_before_hist_no_index where a = 1",
+      "explain select * from topn_before_hist_no_index where a = 2"
     ]
   },
   {
diff --git a/statistics/testdata/stats_suite_out.json b/statistics/testdata/stats_suite_out.json
index c25f082455c2f..b60a351b6cead 100644
--- a/statistics/testdata/stats_suite_out.json
+++ b/statistics/testdata/stats_suite_out.json
@@ -427,29 +427,29 @@
     "Cases": [
       [
         "test topn_before_hist  a 0 1 4",
-        "test topn_before_hist  a 0 2 2",
+        "test topn_before_hist  a 0 3 2",
         "test topn_before_hist  idx 1 1 4",
-        "test topn_before_hist  idx 1 2 2",
+        "test topn_before_hist  idx 1 3 2",
         "test topn_after_hist  a 0 2 2",
-        "test topn_after_hist  a 0 6 4",
+        "test topn_after_hist  a 0 7 4",
         "test topn_after_hist  idx 1 2 2",
-        "test topn_after_hist  idx 1 6 4",
+        "test topn_after_hist  idx 1 7 4",
         "test topn_before_hist_no_index  a 0 1 4",
-        "test topn_before_hist_no_index  a 0 2 2",
+        "test topn_before_hist_no_index  a 0 3 2",
         "test topn_after_hist_no_index  a 0 2 2",
-        "test topn_after_hist_no_index  a 0 6 4"
+        "test topn_after_hist_no_index  a 0 7 4"
       ],
       [
-        "test topn_before_hist  a 0 0 2 1 3 4 0",
-        "test topn_before_hist  a 0 1 3 1 5 5 0",
-        "test topn_before_hist  idx 1 0 2 1 3 4 0",
-        "test topn_before_hist  idx 1 1 3 1 5 5 0",
+        "test topn_before_hist  a 0 0 2 1 4 5 0",
+        "test topn_before_hist  a 0 1 3 1 6 6 0",
+        "test topn_before_hist  idx 1 0 2 1 4 5 0",
+        "test topn_before_hist  idx 1 1 3 1 6 6 0",
         "test topn_after_hist  a 0 0 2 1 3 4 0",
         "test topn_after_hist  a 0 1 3 1 5 5 0",
         "test topn_after_hist  idx 1 0 2 1 3 4 0",
         "test topn_after_hist  idx 1 1 3 1 5 5 0",
-        "test topn_before_hist_no_index  a 0 0 2 1 3 4 0",
-        "test topn_before_hist_no_index  a 0 1 3 1 5 5 0",
+        "test topn_before_hist_no_index  a 0 0 2 1 4 5 0",
+        "test topn_before_hist_no_index  a 0 1 3 1 6 6 0",
         "test topn_after_hist_no_index  a 0 0 2 1 3 4 0",
         "test topn_after_hist_no_index  a 0 1 3 1 5 5 0"
       ],
@@ -457,19 +457,37 @@
         "IndexReader_6 4.00 root  index:IndexRangeScan_5",
         "└─IndexRangeScan_5 4.00 cop[tikv] table:topn_before_hist, index:idx(a) range:[1,1], keep order:false"
       ],
+      [
+        "IndexReader_6 0.00 root  index:IndexRangeScan_5",
+        "└─IndexRangeScan_5 0.00 cop[tikv] table:topn_before_hist, index:idx(a) range:[2,2], keep order:false"
+      ],
       [
         "IndexReader_6 4.00 root  index:IndexRangeScan_5",
-        "└─IndexRangeScan_5 4.00 cop[tikv] table:topn_after_hist, index:idx(a) range:[6,6], keep order:false"
+        "└─IndexRangeScan_5 4.00 cop[tikv] table:topn_after_hist, index:idx(a) range:[7,7], keep order:false"
+      ],
+      [
+        "IndexReader_6 0.00 root  index:IndexRangeScan_5",
+        "└─IndexRangeScan_5 0.00 cop[tikv] table:topn_after_hist, index:idx(a) range:[6,6], keep order:false"
       ],
       [
         "TableReader_7 4.00 root  data:Selection_6",
-        "└─Selection_6 4.00 cop[tikv]  eq(test.topn_after_hist_no_index.a, 6)",
+        "└─Selection_6 4.00 cop[tikv]  eq(test.topn_after_hist_no_index.a, 7)",
+        "  └─TableFullScan_5 9.00 cop[tikv] table:topn_after_hist_no_index keep order:false"
+      ],
+      [
+        "TableReader_7 1.00 root  data:Selection_6",
+        "└─Selection_6 1.00 cop[tikv]  eq(test.topn_after_hist_no_index.a, 6)",
         "  └─TableFullScan_5 9.00 cop[tikv] table:topn_after_hist_no_index keep order:false"
       ],
       [
         "TableReader_7 4.00 root  data:Selection_6",
         "└─Selection_6 4.00 cop[tikv]  eq(test.topn_before_hist_no_index.a, 1)",
         "  └─TableFullScan_5 9.00 cop[tikv] table:topn_before_hist_no_index keep order:false"
+      ],
+      [
+        "TableReader_7 1.00 root  data:Selection_6",
+        "└─Selection_6 1.00 cop[tikv]  eq(test.topn_before_hist_no_index.a, 2)",
+        "  └─TableFullScan_5 9.00 cop[tikv] table:topn_before_hist_no_index keep order:false"
       ]
     ]
   },