Skip to content

Commit

Permalink
plan, stats: fix inconsistent row count estimation (#7233)
Browse files Browse the repository at this point in the history
  • Loading branch information
alivxxx authored and zz-jason committed Aug 6, 2018
1 parent a7fc9c3 commit 44e6c3c
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 23 deletions.
21 changes: 11 additions & 10 deletions cmd/explaintest/r/explain_easy.result
Original file line number Diff line number Diff line change
Expand Up @@ -112,13 +112,14 @@ MemTableScan_4 10000.00 root
explain select c2 = (select c2 from t2 where t1.c1 = t2.c1 order by c1 limit 1) from t1;
id count task operator info
Projection_12 10000.00 root eq(test.t1.c2, test.t2.c2)
└─Apply_14 10000.00 root left outer join, inner:Limit_22
└─Apply_14 10000.00 root left outer join, inner:Limit_21
├─TableReader_16 10000.00 root data:TableScan_15
│ └─TableScan_15 10000.00 cop table:t1, range:[-inf,+inf], keep order:false, stats:pseudo
└─Limit_22 1.00 root offset:0, count:1
└─IndexLookUp_47 0.00 root
├─IndexScan_45 0.00 cop table:t2, index:c1, range: decided by [eq(test.t1.c1, test.t2.c1)], keep order:true, stats:pseudo
└─TableScan_46 0.00 cop table:t2, keep order:false, stats:pseudo
└─Limit_21 1.00 root offset:0, count:1
└─IndexLookUp_43 1.00 root
├─Limit_42 1.00 cop offset:0, count:1
│ └─IndexScan_40 1.25 cop table:t2, index:c1, range: decided by [eq(test.t1.c1, test.t2.c1)], keep order:true, stats:pseudo
└─TableScan_41 1.00 cop table:t2, keep order:false
explain select * from t1 order by c1 desc limit 1;
id count task operator info
Limit_10 1.00 root offset:0, count:1
Expand Down Expand Up @@ -286,8 +287,8 @@ Projection_11 10000.00 root 9_aux_0
│ └─TableScan_14 10000.00 cop table:t, range:[-inf,+inf], keep order:false, stats:pseudo
└─StreamAgg_20 1.00 root funcs:count(1)
└─IndexJoin_32 10000.00 root inner join, inner:TableReader_31, outer key:s.a, inner key:t1.a
├─IndexReader_36 10.00 root index:IndexScan_35
│ └─IndexScan_35 10.00 cop table:s, index:b, range: decided by [eq(s.b, test.t.a)], keep order:false, stats:pseudo
├─IndexReader_36 10000.00 root index:IndexScan_35
│ └─IndexScan_35 10000.00 cop table:s, index:b, range: decided by [eq(s.b, test.t.a)], keep order:false, stats:pseudo
└─TableReader_31 10.00 root data:TableScan_30
└─TableScan_30 10.00 cop table:t1, range: decided by [s.a], keep order:false, stats:pseudo
explain select t.c in (select count(*) from t s use index(idx), t t1 where s.b = t.a and s.c = t1.a) from t;
Expand All @@ -298,9 +299,9 @@ Projection_11 10000.00 root 9_aux_0
│ └─TableScan_14 10000.00 cop table:t, range:[-inf,+inf], keep order:false, stats:pseudo
└─StreamAgg_20 1.00 root funcs:count(1)
└─IndexJoin_33 10000.00 root inner join, inner:TableReader_32, outer key:s.c, inner key:t1.a
├─IndexLookUp_38 10.00 root
│ ├─IndexScan_36 10.00 cop table:s, index:b, range: decided by [eq(s.b, test.t.a)], keep order:false, stats:pseudo
│ └─TableScan_37 10.00 cop table:t, keep order:false, stats:pseudo
├─IndexLookUp_38 10000.00 root
│ ├─IndexScan_36 10000.00 cop table:s, index:b, range: decided by [eq(s.b, test.t.a)], keep order:false, stats:pseudo
│ └─TableScan_37 10000.00 cop table:t, keep order:false, stats:pseudo
└─TableReader_32 10.00 root data:TableScan_31
└─TableScan_31 10.00 cop table:t1, range: decided by [s.c], keep order:false, stats:pseudo
drop table if exists t;
Expand Down
11 changes: 6 additions & 5 deletions cmd/explaintest/r/explain_easy_stats.result
Original file line number Diff line number Diff line change
Expand Up @@ -101,13 +101,14 @@ MemTableScan_4 10000.00 root
explain select c2 = (select c2 from t2 where t1.c1 = t2.c1 order by c1 limit 1) from t1;
id count task operator info
Projection_12 1999.00 root eq(test.t1.c2, test.t2.c2)
└─Apply_14 1999.00 root left outer join, inner:Limit_22
└─Apply_14 1999.00 root left outer join, inner:Limit_21
├─TableReader_16 1999.00 root data:TableScan_15
│ └─TableScan_15 1999.00 cop table:t1, range:[-inf,+inf], keep order:false
└─Limit_22 1.00 root offset:0, count:1
└─IndexLookUp_47 0.00 root
├─IndexScan_45 0.00 cop table:t2, index:c1, range: decided by [eq(test.t1.c1, test.t2.c1)], keep order:true
└─TableScan_46 0.00 cop table:t2, keep order:false
└─Limit_21 1.00 root offset:0, count:1
└─IndexLookUp_43 1.00 root
├─Limit_42 1.00 cop offset:0, count:1
│ └─IndexScan_40 1.25 cop table:t2, index:c1, range: decided by [eq(test.t1.c1, test.t2.c1)], keep order:true
└─TableScan_41 1.00 cop table:t2, keep order:false
explain select * from t1 order by c1 desc limit 1;
id count task operator info
Limit_10 1.00 root offset:0, count:1
Expand Down
17 changes: 14 additions & 3 deletions executor/analyze.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,12 @@ type AnalyzeExec struct {
tasks []*analyzeTask
}

var maxBucketSize = int64(256)

const (
maxSampleSize = 10000
maxRegionSampleSize = 1000
maxSketchSize = 10000
maxBucketSize = 256
defaultCMSketchDepth = 5
defaultCMSketchWidth = 2048
)
Expand Down Expand Up @@ -210,7 +211,7 @@ func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statis
if err != nil {
return nil, nil, errors.Trace(err)
}
hist, err = statistics.MergeHistograms(e.ctx.GetSessionVars().StmtCtx, hist, statistics.HistogramFromProto(resp.Hist), maxBucketSize)
hist, err = statistics.MergeHistograms(e.ctx.GetSessionVars().StmtCtx, hist, statistics.HistogramFromProto(resp.Hist), int(maxBucketSize))
if err != nil {
return nil, nil, errors.Trace(err)
}
Expand Down Expand Up @@ -338,7 +339,7 @@ func (e *AnalyzeColumnsExec) buildStats() (hists []*statistics.Histogram, cms []
}
sc := e.ctx.GetSessionVars().StmtCtx
if e.pkInfo != nil {
pkHist, err = statistics.MergeHistograms(sc, pkHist, statistics.HistogramFromProto(resp.PkHist), maxBucketSize)
pkHist, err = statistics.MergeHistograms(sc, pkHist, statistics.HistogramFromProto(resp.PkHist), int(maxBucketSize))
if err != nil {
return nil, nil, errors.Trace(err)
}
Expand Down Expand Up @@ -373,3 +374,13 @@ func (e *AnalyzeColumnsExec) buildStats() (hists []*statistics.Histogram, cms []
}
return hists, cms, nil
}

// SetMaxBucketSizeForTest sets the `maxBucketSize`.
func SetMaxBucketSizeForTest(size int64) {
maxBucketSize = size
}

// GetMaxBucketSizeForTest gets the `maxBucketSize`.
func GetMaxBucketSizeForTest() int64 {
return maxBucketSize
}
35 changes: 35 additions & 0 deletions plan/cbo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
. "github.com/pingcap/check"
"github.com/pingcap/tidb/config"
"github.com/pingcap/tidb/domain"
"github.com/pingcap/tidb/executor"
"github.com/pingcap/tidb/kv"
"github.com/pingcap/tidb/plan"
"github.com/pingcap/tidb/session"
Expand Down Expand Up @@ -620,6 +621,40 @@ func (s *testAnalyzeSuite) TestCorrelatedEstimation(c *C) {
))
}

func (s *testAnalyzeSuite) TestInconsistentEstimation(c *C) {
defer testleak.AfterTest(c)()
store, dom, err := newStoreWithBootstrap()
c.Assert(err, IsNil)
tk := testkit.NewTestKit(c, store)
defer func() {
dom.Close()
store.Close()
}()
tk.MustExec("use test")
tk.MustExec("create table t(a int, b int, c int, index ab(a,b), index ac(a,c))")
tk.MustExec("insert into t values (1,1,1), (1000,1000,1000)")
for i := 0; i < 10; i++ {
tk.MustExec("insert into t values (5,5,5), (10,10,10)")
}
origin := executor.GetMaxBucketSizeForTest()
defer func() { executor.SetMaxBucketSizeForTest(origin) }()
executor.SetMaxBucketSizeForTest(2)
tk.MustExec("analyze table t")
// Force using the histogram to estimate.
tk.MustExec("update mysql.stats_histograms set stats_ver = 0")
dom.StatsHandle().Clear()
dom.StatsHandle().Update(dom.InfoSchema())
// Using the histogram (a, b) to estimate `a = 5` will get 1.22, while using the CM Sketch to estimate
// the `a = 5 and c = 5` will get 10, it is not consistent.
tk.MustQuery("explain select * from t use index(ab) where a = 5 and c = 5").
Check(testkit.Rows(
"IndexLookUp_8 10.00 root ",
"├─IndexScan_5 12.50 cop table:t, index:a, b, range:[5,5], keep order:false",
"└─Selection_7 10.00 cop eq(test.t.c, 5)",
" └─TableScan_6 12.50 cop table:t, keep order:false",
))
}

func newStoreWithBootstrap() (kv.Storage, *domain.Domain, error) {
store, err := mockstore.NewMockTikvStore()
if err != nil {
Expand Down
10 changes: 10 additions & 0 deletions plan/logical_plans.go
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,11 @@ func (ds *DataSource) deriveTablePathStats(path *accessPath) (bool, error) {
return false, errors.Trace(err)
}
path.countAfterAccess, err = ds.statisticTable.GetRowCountByIntColumnRanges(sc, pkCol.ID, path.ranges)
// If the `countAfterAccess` is less than `stats.count`, there must be some inconsistent stats info.
// We prefer the `stats.count` because it could use more stats info to calculate the selectivity.
if path.countAfterAccess < ds.stats.count {
path.countAfterAccess = math.Min(ds.stats.count/selectionFactor, float64(ds.statisticTable.Count))
}
// Check whether the primary key is covered by point query.
noIntervalRange := true
for _, ran := range path.ranges {
Expand Down Expand Up @@ -443,6 +448,11 @@ func (ds *DataSource) deriveIndexPathStats(path *accessPath) (bool, error) {
path.countAfterAccess = ds.statisticTable.PseudoAvgCountPerValue()
}
}
// If the `countAfterAccess` is less than `stats.count`, there must be some inconsistent stats info.
// We prefer the `stats.count` because it could use more stats info to calculate the selectivity.
if path.countAfterAccess < ds.stats.count {
path.countAfterAccess = math.Min(ds.stats.count/selectionFactor, float64(ds.statisticTable.Count))
}
if path.indexFilters != nil {
selectivity, err := ds.statisticTable.Selectivity(ds.ctx, path.indexFilters)
if err != nil {
Expand Down
16 changes: 11 additions & 5 deletions statistics/selectivity.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ type exprSet struct {
mask int64
// ranges contains all the ranges we got.
ranges []*ranger.Range
// numCols is the number of columns contained in the index or column(which is always 1).
numCols int
}

// The type of the exprSet.
Expand Down Expand Up @@ -177,7 +179,7 @@ func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Exp
if err != nil {
return 0, errors.Trace(err)
}
sets = append(sets, &exprSet{tp: colType, ID: col.ID, mask: maskCovered, ranges: ranges})
sets = append(sets, &exprSet{tp: colType, ID: col.ID, mask: maskCovered, ranges: ranges, numCols: 1})
if mysql.HasPriKeyFlag(colInfo.Info.Flag) {
sets[len(sets)-1].tp = pkType
}
Expand All @@ -190,7 +192,7 @@ func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Exp
if err != nil {
return 0, errors.Trace(err)
}
sets = append(sets, &exprSet{tp: indexType, ID: idxInfo.ID, mask: maskCovered, ranges: ranges})
sets = append(sets, &exprSet{tp: indexType, ID: idxInfo.ID, mask: maskCovered, ranges: ranges, numCols: len(idxInfo.Info.Columns)})
}
}
sets = getUsableSetsByGreedy(sets)
Expand Down Expand Up @@ -254,16 +256,20 @@ func getUsableSetsByGreedy(sets []*exprSet) (newBlocks []*exprSet) {
mask := int64(math.MaxInt64)
for {
// Choose the index that covers most.
bestID, bestCount, bestTp := -1, 0, colType
bestID, bestCount, bestTp, bestNumCols := -1, 0, colType, 0
for i, set := range sets {
set.mask &= mask
bits := popCount(set.mask)
// This set cannot cover any thing, just skip it.
if bits == 0 {
continue
}
if (bestTp == colType && set.tp < colType) || bestCount < bits {
bestID, bestCount, bestTp = i, bits, set.tp
// We greedy select the stats info based on:
// (1): The stats type, always prefer the primary key or index.
// (2): The number of expression that it covers, the more the better.
// (3): The number of columns that it contains, the less the better.
if (bestTp == colType && set.tp != colType) || bestCount < bits || (bestCount == bits && bestNumCols > set.numCols) {
bestID, bestCount, bestTp, bestNumCols = i, bits, set.tp, set.numCols
}
}
if bestCount == 0 {
Expand Down

0 comments on commit 44e6c3c

Please sign in to comment.