From 44e6c3c2e110238dd9ff834a24ec3e2cb0dc6062 Mon Sep 17 00:00:00 2001 From: Haibin Xie Date: Mon, 6 Aug 2018 17:39:33 +0800 Subject: [PATCH] plan, stats: fix inconsistent row count estimation (#7233) --- cmd/explaintest/r/explain_easy.result | 21 +++++++------ cmd/explaintest/r/explain_easy_stats.result | 11 ++++--- executor/analyze.go | 17 ++++++++-- plan/cbo_test.go | 35 +++++++++++++++++++++ plan/logical_plans.go | 10 ++++++ statistics/selectivity.go | 16 +++++++--- 6 files changed, 87 insertions(+), 23 deletions(-) diff --git a/cmd/explaintest/r/explain_easy.result b/cmd/explaintest/r/explain_easy.result index 51cdef794fe6b..2e096f0e4cfc1 100644 --- a/cmd/explaintest/r/explain_easy.result +++ b/cmd/explaintest/r/explain_easy.result @@ -112,13 +112,14 @@ MemTableScan_4 10000.00 root explain select c2 = (select c2 from t2 where t1.c1 = t2.c1 order by c1 limit 1) from t1; id count task operator info Projection_12 10000.00 root eq(test.t1.c2, test.t2.c2) -└─Apply_14 10000.00 root left outer join, inner:Limit_22 +└─Apply_14 10000.00 root left outer join, inner:Limit_21 ├─TableReader_16 10000.00 root data:TableScan_15 │ └─TableScan_15 10000.00 cop table:t1, range:[-inf,+inf], keep order:false, stats:pseudo - └─Limit_22 1.00 root offset:0, count:1 - └─IndexLookUp_47 0.00 root - ├─IndexScan_45 0.00 cop table:t2, index:c1, range: decided by [eq(test.t1.c1, test.t2.c1)], keep order:true, stats:pseudo - └─TableScan_46 0.00 cop table:t2, keep order:false, stats:pseudo + └─Limit_21 1.00 root offset:0, count:1 + └─IndexLookUp_43 1.00 root + ├─Limit_42 1.00 cop offset:0, count:1 + │ └─IndexScan_40 1.25 cop table:t2, index:c1, range: decided by [eq(test.t1.c1, test.t2.c1)], keep order:true, stats:pseudo + └─TableScan_41 1.00 cop table:t2, keep order:false explain select * from t1 order by c1 desc limit 1; id count task operator info Limit_10 1.00 root offset:0, count:1 @@ -286,8 +287,8 @@ Projection_11 10000.00 root 9_aux_0 │ └─TableScan_14 10000.00 cop table:t, 
range:[-inf,+inf], keep order:false, stats:pseudo └─StreamAgg_20 1.00 root funcs:count(1) └─IndexJoin_32 10000.00 root inner join, inner:TableReader_31, outer key:s.a, inner key:t1.a - ├─IndexReader_36 10.00 root index:IndexScan_35 - │ └─IndexScan_35 10.00 cop table:s, index:b, range: decided by [eq(s.b, test.t.a)], keep order:false, stats:pseudo + ├─IndexReader_36 10000.00 root index:IndexScan_35 + │ └─IndexScan_35 10000.00 cop table:s, index:b, range: decided by [eq(s.b, test.t.a)], keep order:false, stats:pseudo └─TableReader_31 10.00 root data:TableScan_30 └─TableScan_30 10.00 cop table:t1, range: decided by [s.a], keep order:false, stats:pseudo explain select t.c in (select count(*) from t s use index(idx), t t1 where s.b = t.a and s.c = t1.a) from t; @@ -298,9 +299,9 @@ Projection_11 10000.00 root 9_aux_0 │ └─TableScan_14 10000.00 cop table:t, range:[-inf,+inf], keep order:false, stats:pseudo └─StreamAgg_20 1.00 root funcs:count(1) └─IndexJoin_33 10000.00 root inner join, inner:TableReader_32, outer key:s.c, inner key:t1.a - ├─IndexLookUp_38 10.00 root - │ ├─IndexScan_36 10.00 cop table:s, index:b, range: decided by [eq(s.b, test.t.a)], keep order:false, stats:pseudo - │ └─TableScan_37 10.00 cop table:t, keep order:false, stats:pseudo + ├─IndexLookUp_38 10000.00 root + │ ├─IndexScan_36 10000.00 cop table:s, index:b, range: decided by [eq(s.b, test.t.a)], keep order:false, stats:pseudo + │ └─TableScan_37 10000.00 cop table:t, keep order:false, stats:pseudo └─TableReader_32 10.00 root data:TableScan_31 └─TableScan_31 10.00 cop table:t1, range: decided by [s.c], keep order:false, stats:pseudo drop table if exists t; diff --git a/cmd/explaintest/r/explain_easy_stats.result b/cmd/explaintest/r/explain_easy_stats.result index 3e314cb6c1054..740fa7b9fea4b 100644 --- a/cmd/explaintest/r/explain_easy_stats.result +++ b/cmd/explaintest/r/explain_easy_stats.result @@ -101,13 +101,14 @@ MemTableScan_4 10000.00 root explain select c2 = (select c2 from t2 where t1.c1 = 
t2.c1 order by c1 limit 1) from t1; id count task operator info Projection_12 1999.00 root eq(test.t1.c2, test.t2.c2) -└─Apply_14 1999.00 root left outer join, inner:Limit_22 +└─Apply_14 1999.00 root left outer join, inner:Limit_21 ├─TableReader_16 1999.00 root data:TableScan_15 │ └─TableScan_15 1999.00 cop table:t1, range:[-inf,+inf], keep order:false - └─Limit_22 1.00 root offset:0, count:1 - └─IndexLookUp_47 0.00 root - ├─IndexScan_45 0.00 cop table:t2, index:c1, range: decided by [eq(test.t1.c1, test.t2.c1)], keep order:true - └─TableScan_46 0.00 cop table:t2, keep order:false + └─Limit_21 1.00 root offset:0, count:1 + └─IndexLookUp_43 1.00 root + ├─Limit_42 1.00 cop offset:0, count:1 + │ └─IndexScan_40 1.25 cop table:t2, index:c1, range: decided by [eq(test.t1.c1, test.t2.c1)], keep order:true + └─TableScan_41 1.00 cop table:t2, keep order:false explain select * from t1 order by c1 desc limit 1; id count task operator info Limit_10 1.00 root offset:0, count:1 diff --git a/executor/analyze.go b/executor/analyze.go index d9b08ece4fd15..0c172d38226a0 100644 --- a/executor/analyze.go +++ b/executor/analyze.go @@ -42,11 +42,12 @@ type AnalyzeExec struct { tasks []*analyzeTask } +var maxBucketSize = int64(256) + const ( maxSampleSize = 10000 maxRegionSampleSize = 1000 maxSketchSize = 10000 - maxBucketSize = 256 defaultCMSketchDepth = 5 defaultCMSketchWidth = 2048 ) @@ -210,7 +211,7 @@ func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statis if err != nil { return nil, nil, errors.Trace(err) } - hist, err = statistics.MergeHistograms(e.ctx.GetSessionVars().StmtCtx, hist, statistics.HistogramFromProto(resp.Hist), maxBucketSize) + hist, err = statistics.MergeHistograms(e.ctx.GetSessionVars().StmtCtx, hist, statistics.HistogramFromProto(resp.Hist), int(maxBucketSize)) if err != nil { return nil, nil, errors.Trace(err) } @@ -338,7 +339,7 @@ func (e *AnalyzeColumnsExec) buildStats() (hists []*statistics.Histogram, cms [] } sc := 
e.ctx.GetSessionVars().StmtCtx if e.pkInfo != nil { - pkHist, err = statistics.MergeHistograms(sc, pkHist, statistics.HistogramFromProto(resp.PkHist), maxBucketSize) + pkHist, err = statistics.MergeHistograms(sc, pkHist, statistics.HistogramFromProto(resp.PkHist), int(maxBucketSize)) if err != nil { return nil, nil, errors.Trace(err) } @@ -373,3 +374,13 @@ func (e *AnalyzeColumnsExec) buildStats() (hists []*statistics.Histogram, cms [] } return hists, cms, nil } + +// SetMaxBucketSizeForTest sets the `maxBucketSize`. +func SetMaxBucketSizeForTest(size int64) { + maxBucketSize = size +} + +// GetMaxBucketSizeForTest gets the `maxBucketSize`. +func GetMaxBucketSizeForTest() int64 { + return maxBucketSize +} diff --git a/plan/cbo_test.go b/plan/cbo_test.go index 5960960e12c15..3afcdafb25922 100644 --- a/plan/cbo_test.go +++ b/plan/cbo_test.go @@ -21,6 +21,7 @@ import ( . "github.com/pingcap/check" "github.com/pingcap/tidb/config" "github.com/pingcap/tidb/domain" + "github.com/pingcap/tidb/executor" "github.com/pingcap/tidb/kv" "github.com/pingcap/tidb/plan" "github.com/pingcap/tidb/session" @@ -620,6 +621,40 @@ func (s *testAnalyzeSuite) TestCorrelatedEstimation(c *C) { )) } +func (s *testAnalyzeSuite) TestInconsistentEstimation(c *C) { + defer testleak.AfterTest(c)() + store, dom, err := newStoreWithBootstrap() + c.Assert(err, IsNil) + tk := testkit.NewTestKit(c, store) + defer func() { + dom.Close() + store.Close() + }() + tk.MustExec("use test") + tk.MustExec("create table t(a int, b int, c int, index ab(a,b), index ac(a,c))") + tk.MustExec("insert into t values (1,1,1), (1000,1000,1000)") + for i := 0; i < 10; i++ { + tk.MustExec("insert into t values (5,5,5), (10,10,10)") + } + origin := executor.GetMaxBucketSizeForTest() + defer func() { executor.SetMaxBucketSizeForTest(origin) }() + executor.SetMaxBucketSizeForTest(2) + tk.MustExec("analyze table t") + // Force using the histogram to estimate. 
+ tk.MustExec("update mysql.stats_histograms set stats_ver = 0") + dom.StatsHandle().Clear() + dom.StatsHandle().Update(dom.InfoSchema()) + // Using the histogram (a, b) to estimate `a = 5` will get 1.22, while using the CM Sketch to estimate + // the `a = 5 and c = 5` will get 10, it is not consistent. + tk.MustQuery("explain select * from t use index(ab) where a = 5 and c = 5"). + Check(testkit.Rows( + "IndexLookUp_8 10.00 root ", + "├─IndexScan_5 12.50 cop table:t, index:a, b, range:[5,5], keep order:false", + "└─Selection_7 10.00 cop eq(test.t.c, 5)", + " └─TableScan_6 12.50 cop table:t, keep order:false", + )) +} + func newStoreWithBootstrap() (kv.Storage, *domain.Domain, error) { store, err := mockstore.NewMockTikvStore() if err != nil { diff --git a/plan/logical_plans.go b/plan/logical_plans.go index eabbd4ceeac47..cbc659de658e4 100644 --- a/plan/logical_plans.go +++ b/plan/logical_plans.go @@ -393,6 +393,11 @@ func (ds *DataSource) deriveTablePathStats(path *accessPath) (bool, error) { return false, errors.Trace(err) } path.countAfterAccess, err = ds.statisticTable.GetRowCountByIntColumnRanges(sc, pkCol.ID, path.ranges) + // If the `countAfterAccess` is less than `stats.count`, there must be some inconsistent stats info. + // We prefer the `stats.count` because it could use more stats info to calculate the selectivity. + if path.countAfterAccess < ds.stats.count { + path.countAfterAccess = math.Min(ds.stats.count/selectionFactor, float64(ds.statisticTable.Count)) + } // Check whether the primary key is covered by point query. noIntervalRange := true for _, ran := range path.ranges { @@ -443,6 +448,11 @@ func (ds *DataSource) deriveIndexPathStats(path *accessPath) (bool, error) { path.countAfterAccess = ds.statisticTable.PseudoAvgCountPerValue() } } + // If the `countAfterAccess` is less than `stats.count`, there must be some inconsistent stats info. + // We prefer the `stats.count` because it could use more stats info to calculate the selectivity. 
+ if path.countAfterAccess < ds.stats.count { + path.countAfterAccess = math.Min(ds.stats.count/selectionFactor, float64(ds.statisticTable.Count)) + } if path.indexFilters != nil { selectivity, err := ds.statisticTable.Selectivity(ds.ctx, path.indexFilters) if err != nil { diff --git a/statistics/selectivity.go b/statistics/selectivity.go index a98bfe4d51efa..79260584adbba 100644 --- a/statistics/selectivity.go +++ b/statistics/selectivity.go @@ -35,6 +35,8 @@ type exprSet struct { mask int64 // ranges contains all the ranges we got. ranges []*ranger.Range + // numCols is the number of columns contained in the index or column(which is always 1). + numCols int } // The type of the exprSet. @@ -177,7 +179,7 @@ func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Exp if err != nil { return 0, errors.Trace(err) } - sets = append(sets, &exprSet{tp: colType, ID: col.ID, mask: maskCovered, ranges: ranges}) + sets = append(sets, &exprSet{tp: colType, ID: col.ID, mask: maskCovered, ranges: ranges, numCols: 1}) if mysql.HasPriKeyFlag(colInfo.Info.Flag) { sets[len(sets)-1].tp = pkType } @@ -190,7 +192,7 @@ func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Exp if err != nil { return 0, errors.Trace(err) } - sets = append(sets, &exprSet{tp: indexType, ID: idxInfo.ID, mask: maskCovered, ranges: ranges}) + sets = append(sets, &exprSet{tp: indexType, ID: idxInfo.ID, mask: maskCovered, ranges: ranges, numCols: len(idxInfo.Info.Columns)}) } } sets = getUsableSetsByGreedy(sets) @@ -254,7 +256,7 @@ func getUsableSetsByGreedy(sets []*exprSet) (newBlocks []*exprSet) { mask := int64(math.MaxInt64) for { // Choose the index that covers most. 
- bestID, bestCount, bestTp := -1, 0, colType + bestID, bestCount, bestTp, bestNumCols := -1, 0, colType, 0 for i, set := range sets { set.mask &= mask bits := popCount(set.mask) @@ -262,8 +264,12 @@ func getUsableSetsByGreedy(sets []*exprSet) (newBlocks []*exprSet) { if bits == 0 { continue } - if (bestTp == colType && set.tp < colType) || bestCount < bits { - bestID, bestCount, bestTp = i, bits, set.tp + // We greedily select the stats info based on: + // (1): The stats type, always prefer the primary key or index. + // (2): The number of expressions that it covers, the more the better. + // (3): The number of columns that it contains, the fewer the better. + if (bestTp == colType && set.tp != colType) || bestCount < bits || (bestCount == bits && bestNumCols > set.numCols) { + bestID, bestCount, bestTp, bestNumCols = i, bits, set.tp, set.numCols } } if bestCount == 0 {