From 44e6c3c2e110238dd9ff834a24ec3e2cb0dc6062 Mon Sep 17 00:00:00 2001
From: Haibin Xie <lambdax.tyler@gmail.com>
Date: Mon, 6 Aug 2018 17:39:33 +0800
Subject: [PATCH] plan, stats: fix inconsistent row count estimation (#7233)

---
 cmd/explaintest/r/explain_easy.result       | 21 +++++++------
 cmd/explaintest/r/explain_easy_stats.result | 11 ++++---
 executor/analyze.go                         | 17 ++++++++--
 plan/cbo_test.go                            | 35 +++++++++++++++++++++
 plan/logical_plans.go                       | 10 ++++++
 statistics/selectivity.go                   | 16 +++++++---
 6 files changed, 87 insertions(+), 23 deletions(-)

diff --git a/cmd/explaintest/r/explain_easy.result b/cmd/explaintest/r/explain_easy.result
index 51cdef794fe6b..2e096f0e4cfc1 100644
--- a/cmd/explaintest/r/explain_easy.result
+++ b/cmd/explaintest/r/explain_easy.result
@@ -112,13 +112,14 @@ MemTableScan_4	10000.00	root
 explain select c2 = (select c2 from t2 where t1.c1 = t2.c1 order by c1 limit 1) from t1;
 id	count	task	operator info
 Projection_12	10000.00	root	eq(test.t1.c2, test.t2.c2)
-└─Apply_14	10000.00	root	left outer join, inner:Limit_22
+└─Apply_14	10000.00	root	left outer join, inner:Limit_21
   ├─TableReader_16	10000.00	root	data:TableScan_15
   │ └─TableScan_15	10000.00	cop	table:t1, range:[-inf,+inf], keep order:false, stats:pseudo
-  └─Limit_22	1.00	root	offset:0, count:1
-    └─IndexLookUp_47	0.00	root	
-      ├─IndexScan_45	0.00	cop	table:t2, index:c1, range: decided by [eq(test.t1.c1, test.t2.c1)], keep order:true, stats:pseudo
-      └─TableScan_46	0.00	cop	table:t2, keep order:false, stats:pseudo
+  └─Limit_21	1.00	root	offset:0, count:1
+    └─IndexLookUp_43	1.00	root	
+      ├─Limit_42	1.00	cop	offset:0, count:1
+      │ └─IndexScan_40	1.25	cop	table:t2, index:c1, range: decided by [eq(test.t1.c1, test.t2.c1)], keep order:true, stats:pseudo
+      └─TableScan_41	1.00	cop	table:t2, keep order:false
 explain select * from t1 order by c1 desc limit 1;
 id	count	task	operator info
 Limit_10	1.00	root	offset:0, count:1
@@ -286,8 +287,8 @@ Projection_11	10000.00	root	9_aux_0
   │ └─TableScan_14	10000.00	cop	table:t, range:[-inf,+inf], keep order:false, stats:pseudo
   └─StreamAgg_20	1.00	root	funcs:count(1)
     └─IndexJoin_32	10000.00	root	inner join, inner:TableReader_31, outer key:s.a, inner key:t1.a
-      ├─IndexReader_36	10.00	root	index:IndexScan_35
-      │ └─IndexScan_35	10.00	cop	table:s, index:b, range: decided by [eq(s.b, test.t.a)], keep order:false, stats:pseudo
+      ├─IndexReader_36	10000.00	root	index:IndexScan_35
+      │ └─IndexScan_35	10000.00	cop	table:s, index:b, range: decided by [eq(s.b, test.t.a)], keep order:false, stats:pseudo
       └─TableReader_31	10.00	root	data:TableScan_30
         └─TableScan_30	10.00	cop	table:t1, range: decided by [s.a], keep order:false, stats:pseudo
 explain select t.c in (select count(*) from t s use index(idx), t t1 where s.b = t.a and s.c = t1.a) from t;
@@ -298,9 +299,9 @@ Projection_11	10000.00	root	9_aux_0
   │ └─TableScan_14	10000.00	cop	table:t, range:[-inf,+inf], keep order:false, stats:pseudo
   └─StreamAgg_20	1.00	root	funcs:count(1)
     └─IndexJoin_33	10000.00	root	inner join, inner:TableReader_32, outer key:s.c, inner key:t1.a
-      ├─IndexLookUp_38	10.00	root	
-      │ ├─IndexScan_36	10.00	cop	table:s, index:b, range: decided by [eq(s.b, test.t.a)], keep order:false, stats:pseudo
-      │ └─TableScan_37	10.00	cop	table:t, keep order:false, stats:pseudo
+      ├─IndexLookUp_38	10000.00	root	
+      │ ├─IndexScan_36	10000.00	cop	table:s, index:b, range: decided by [eq(s.b, test.t.a)], keep order:false, stats:pseudo
+      │ └─TableScan_37	10000.00	cop	table:t, keep order:false, stats:pseudo
       └─TableReader_32	10.00	root	data:TableScan_31
         └─TableScan_31	10.00	cop	table:t1, range: decided by [s.c], keep order:false, stats:pseudo
 drop table if exists t;
diff --git a/cmd/explaintest/r/explain_easy_stats.result b/cmd/explaintest/r/explain_easy_stats.result
index 3e314cb6c1054..740fa7b9fea4b 100644
--- a/cmd/explaintest/r/explain_easy_stats.result
+++ b/cmd/explaintest/r/explain_easy_stats.result
@@ -101,13 +101,14 @@ MemTableScan_4	10000.00	root
 explain select c2 = (select c2 from t2 where t1.c1 = t2.c1 order by c1 limit 1) from t1;
 id	count	task	operator info
 Projection_12	1999.00	root	eq(test.t1.c2, test.t2.c2)
-└─Apply_14	1999.00	root	left outer join, inner:Limit_22
+└─Apply_14	1999.00	root	left outer join, inner:Limit_21
   ├─TableReader_16	1999.00	root	data:TableScan_15
   │ └─TableScan_15	1999.00	cop	table:t1, range:[-inf,+inf], keep order:false
-  └─Limit_22	1.00	root	offset:0, count:1
-    └─IndexLookUp_47	0.00	root	
-      ├─IndexScan_45	0.00	cop	table:t2, index:c1, range: decided by [eq(test.t1.c1, test.t2.c1)], keep order:true
-      └─TableScan_46	0.00	cop	table:t2, keep order:false
+  └─Limit_21	1.00	root	offset:0, count:1
+    └─IndexLookUp_43	1.00	root	
+      ├─Limit_42	1.00	cop	offset:0, count:1
+      │ └─IndexScan_40	1.25	cop	table:t2, index:c1, range: decided by [eq(test.t1.c1, test.t2.c1)], keep order:true
+      └─TableScan_41	1.00	cop	table:t2, keep order:false
 explain select * from t1 order by c1 desc limit 1;
 id	count	task	operator info
 Limit_10	1.00	root	offset:0, count:1
diff --git a/executor/analyze.go b/executor/analyze.go
index d9b08ece4fd15..0c172d38226a0 100644
--- a/executor/analyze.go
+++ b/executor/analyze.go
@@ -42,11 +42,12 @@ type AnalyzeExec struct {
 	tasks []*analyzeTask
 }
 
+var maxBucketSize = int64(256)
+
 const (
 	maxSampleSize        = 10000
 	maxRegionSampleSize  = 1000
 	maxSketchSize        = 10000
-	maxBucketSize        = 256
 	defaultCMSketchDepth = 5
 	defaultCMSketchWidth = 2048
 )
@@ -210,7 +211,7 @@ func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statis
 		if err != nil {
 			return nil, nil, errors.Trace(err)
 		}
-		hist, err = statistics.MergeHistograms(e.ctx.GetSessionVars().StmtCtx, hist, statistics.HistogramFromProto(resp.Hist), maxBucketSize)
+		hist, err = statistics.MergeHistograms(e.ctx.GetSessionVars().StmtCtx, hist, statistics.HistogramFromProto(resp.Hist), int(maxBucketSize))
 		if err != nil {
 			return nil, nil, errors.Trace(err)
 		}
@@ -338,7 +339,7 @@ func (e *AnalyzeColumnsExec) buildStats() (hists []*statistics.Histogram, cms []
 		}
 		sc := e.ctx.GetSessionVars().StmtCtx
 		if e.pkInfo != nil {
-			pkHist, err = statistics.MergeHistograms(sc, pkHist, statistics.HistogramFromProto(resp.PkHist), maxBucketSize)
+			pkHist, err = statistics.MergeHistograms(sc, pkHist, statistics.HistogramFromProto(resp.PkHist), int(maxBucketSize))
 			if err != nil {
 				return nil, nil, errors.Trace(err)
 			}
@@ -373,3 +374,13 @@ func (e *AnalyzeColumnsExec) buildStats() (hists []*statistics.Histogram, cms []
 	}
 	return hists, cms, nil
 }
+
+// SetMaxBucketSizeForTest overrides the package-level `maxBucketSize`; it is meant to be called from tests only.
+func SetMaxBucketSizeForTest(size int64) {
+	maxBucketSize = size
+}
+
+// GetMaxBucketSizeForTest returns the current package-level `maxBucketSize`; it is meant to be called from tests only.
+func GetMaxBucketSizeForTest() int64 {
+	return maxBucketSize
+}
diff --git a/plan/cbo_test.go b/plan/cbo_test.go
index 5960960e12c15..3afcdafb25922 100644
--- a/plan/cbo_test.go
+++ b/plan/cbo_test.go
@@ -21,6 +21,7 @@ import (
 	. "github.com/pingcap/check"
 	"github.com/pingcap/tidb/config"
 	"github.com/pingcap/tidb/domain"
+	"github.com/pingcap/tidb/executor"
 	"github.com/pingcap/tidb/kv"
 	"github.com/pingcap/tidb/plan"
 	"github.com/pingcap/tidb/session"
@@ -620,6 +621,40 @@ func (s *testAnalyzeSuite) TestCorrelatedEstimation(c *C) {
 		))
 }
 
+func (s *testAnalyzeSuite) TestInconsistentEstimation(c *C) {
+	defer testleak.AfterTest(c)()
+	store, dom, err := newStoreWithBootstrap()
+	c.Assert(err, IsNil)
+	tk := testkit.NewTestKit(c, store)
+	defer func() {
+		dom.Close()
+		store.Close()
+	}()
+	tk.MustExec("use test")
+	tk.MustExec("create table t(a int, b int, c int, index ab(a,b), index ac(a,c))")
+	tk.MustExec("insert into t values (1,1,1), (1000,1000,1000)")
+	for i := 0; i < 10; i++ {
+		tk.MustExec("insert into t values (5,5,5), (10,10,10)")
+	}
+	origin := executor.GetMaxBucketSizeForTest()
+	defer func() { executor.SetMaxBucketSizeForTest(origin) }()
+	executor.SetMaxBucketSizeForTest(2)
+	tk.MustExec("analyze table t")
+	// Force using the histogram to estimate.
+	tk.MustExec("update mysql.stats_histograms set stats_ver = 0")
+	dom.StatsHandle().Clear()
+	dom.StatsHandle().Update(dom.InfoSchema())
+	// Using the histogram of index (a, b) to estimate `a = 5` gets 1.22, while using the CM Sketch to estimate
+	// `a = 5 and c = 5` gets 10, so the two estimates are inconsistent.
+	tk.MustQuery("explain select * from t use index(ab) where a = 5 and c = 5").
+		Check(testkit.Rows(
+			"IndexLookUp_8 10.00 root ",
+			"├─IndexScan_5 12.50 cop table:t, index:a, b, range:[5,5], keep order:false",
+			"└─Selection_7 10.00 cop eq(test.t.c, 5)",
+			"  └─TableScan_6 12.50 cop table:t, keep order:false",
+		))
+}
+
 func newStoreWithBootstrap() (kv.Storage, *domain.Domain, error) {
 	store, err := mockstore.NewMockTikvStore()
 	if err != nil {
diff --git a/plan/logical_plans.go b/plan/logical_plans.go
index eabbd4ceeac47..cbc659de658e4 100644
--- a/plan/logical_plans.go
+++ b/plan/logical_plans.go
@@ -393,6 +393,11 @@ func (ds *DataSource) deriveTablePathStats(path *accessPath) (bool, error) {
 		return false, errors.Trace(err)
 	}
 	path.countAfterAccess, err = ds.statisticTable.GetRowCountByIntColumnRanges(sc, pkCol.ID, path.ranges)
+	// If the `countAfterAccess` is less than `stats.count`, there must be some inconsistent stats info.
+	// We prefer the `stats.count` because it could use more stats info to calculate the selectivity.
+	if path.countAfterAccess < ds.stats.count {
+		path.countAfterAccess = math.Min(ds.stats.count/selectionFactor, float64(ds.statisticTable.Count))
+	}
 	// Check whether the primary key is covered by point query.
 	noIntervalRange := true
 	for _, ran := range path.ranges {
@@ -443,6 +448,11 @@ func (ds *DataSource) deriveIndexPathStats(path *accessPath) (bool, error) {
 			path.countAfterAccess = ds.statisticTable.PseudoAvgCountPerValue()
 		}
 	}
+	// If the `countAfterAccess` is less than `stats.count`, there must be some inconsistent stats info.
+	// We prefer the `stats.count` because it could use more stats info to calculate the selectivity.
+	if path.countAfterAccess < ds.stats.count {
+		path.countAfterAccess = math.Min(ds.stats.count/selectionFactor, float64(ds.statisticTable.Count))
+	}
 	if path.indexFilters != nil {
 		selectivity, err := ds.statisticTable.Selectivity(ds.ctx, path.indexFilters)
 		if err != nil {
diff --git a/statistics/selectivity.go b/statistics/selectivity.go
index a98bfe4d51efa..79260584adbba 100644
--- a/statistics/selectivity.go
+++ b/statistics/selectivity.go
@@ -35,6 +35,8 @@ type exprSet struct {
 	mask int64
 	// ranges contains all the ranges we got.
 	ranges []*ranger.Range
+	// numCols is the number of columns contained in the index, or 1 when the set is built from a single column.
+	numCols int
 }
 
 // The type of the exprSet.
@@ -177,7 +179,7 @@ func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Exp
 			if err != nil {
 				return 0, errors.Trace(err)
 			}
-			sets = append(sets, &exprSet{tp: colType, ID: col.ID, mask: maskCovered, ranges: ranges})
+			sets = append(sets, &exprSet{tp: colType, ID: col.ID, mask: maskCovered, ranges: ranges, numCols: 1})
 			if mysql.HasPriKeyFlag(colInfo.Info.Flag) {
 				sets[len(sets)-1].tp = pkType
 			}
@@ -190,7 +192,7 @@ func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Exp
 			if err != nil {
 				return 0, errors.Trace(err)
 			}
-			sets = append(sets, &exprSet{tp: indexType, ID: idxInfo.ID, mask: maskCovered, ranges: ranges})
+			sets = append(sets, &exprSet{tp: indexType, ID: idxInfo.ID, mask: maskCovered, ranges: ranges, numCols: len(idxInfo.Info.Columns)})
 		}
 	}
 	sets = getUsableSetsByGreedy(sets)
@@ -254,7 +256,7 @@ func getUsableSetsByGreedy(sets []*exprSet) (newBlocks []*exprSet) {
 	mask := int64(math.MaxInt64)
 	for {
 		// Choose the index that covers most.
-		bestID, bestCount, bestTp := -1, 0, colType
+		bestID, bestCount, bestTp, bestNumCols := -1, 0, colType, 0
 		for i, set := range sets {
 			set.mask &= mask
 			bits := popCount(set.mask)
@@ -262,8 +264,12 @@ func getUsableSetsByGreedy(sets []*exprSet) (newBlocks []*exprSet) {
 			if bits == 0 {
 				continue
 			}
-			if (bestTp == colType && set.tp < colType) || bestCount < bits {
-				bestID, bestCount, bestTp = i, bits, set.tp
+			// We greedily select the stats info based on:
+			// (1): The stats type; always prefer the primary key or index.
+			// (2): The number of expressions that it covers; the more the better.
+			// (3): The number of columns that it contains; the fewer the better.
+			if (bestTp == colType && set.tp != colType) || bestCount < bits || (bestCount == bits && bestNumCols > set.numCols) {
+				bestID, bestCount, bestTp, bestNumCols = i, bits, set.tp, set.numCols
 			}
 		}
 		if bestCount == 0 {