Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

stats: refine the row count estimation for unknown values #7175

Merged
merged 1 commit into from
Jul 29, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions cmd/explaintest/r/explain_complex_stats.result
Original file line number Diff line number Diff line change
Expand Up @@ -158,11 +158,11 @@ Projection_5 39.28 root test.st.cm, test.st.p1, test.st.p2, test.st.p3, test.st.
└─TableScan_14 160.23 cop table:st, keep order:false
explain select dt.id as id, dt.aid as aid, dt.pt as pt, dt.dic as dic, dt.cm as cm, rr.gid as gid, rr.acd as acd, rr.t as t,dt.p1 as p1, dt.p2 as p2, dt.p3 as p3, dt.p4 as p4, dt.p5 as p5, dt.p6_md5 as p6, dt.p7_md5 as p7 from dt dt join rr rr on (rr.pt = 'ios' and rr.t > 1478185592 and dt.aid = rr.aid and dt.dic = rr.dic) where dt.pt = 'ios' and dt.t > 1478185592 and dt.bm = 0 limit 2000;
id count task operator info
Projection_9 428.32 root dt.id, dt.aid, dt.pt, dt.dic, dt.cm, rr.gid, rr.acd, rr.t, dt.p1, dt.p2, dt.p3, dt.p4, dt.p5, dt.p6_md5, dt.p7_md5
└─Limit_12 428.32 root offset:0, count:2000
└─IndexJoin_18 428.32 root inner join, inner:IndexLookUp_17, outer key:dt.aid, dt.dic, inner key:rr.aid, rr.dic
├─TableReader_42 428.32 root data:Selection_41
│ └─Selection_41 428.32 cop eq(dt.pt, "ios"), gt(dt.t, 1478185592), eq(dt.bm, 0)
Projection_9 428.55 root dt.id, dt.aid, dt.pt, dt.dic, dt.cm, rr.gid, rr.acd, rr.t, dt.p1, dt.p2, dt.p3, dt.p4, dt.p5, dt.p6_md5, dt.p7_md5
└─Limit_12 428.55 root offset:0, count:2000
└─IndexJoin_18 428.55 root inner join, inner:IndexLookUp_17, outer key:dt.aid, dt.dic, inner key:rr.aid, rr.dic
├─TableReader_42 428.55 root data:Selection_41
│ └─Selection_41 428.55 cop eq(dt.pt, "ios"), gt(dt.t, 1478185592), eq(dt.bm, 0)
│ └─TableScan_40 2000.00 cop table:dt, range:[0,+inf], keep order:false
└─IndexLookUp_17 970.00 root
├─IndexScan_14 1.00 cop table:rr, index:aid, dic, range: decided by [dt.aid dt.dic], keep order:false
Expand Down
6 changes: 3 additions & 3 deletions cmd/explaintest/r/explain_easy_stats.result
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,10 @@ explain select * from t1 left join t2 on t1.c2 = t2.c1 where t1.c1 > 1;
id count task operator info
Projection_6 2481.25 root test.t1.c1, test.t1.c2, test.t1.c3, test.t2.c1, test.t2.c2
└─MergeJoin_7 2481.25 root left outer join, left key:test.t1.c2, right key:test.t2.c1
├─IndexLookUp_17 1998.00 root
│ ├─Selection_16 1998.00 cop gt(test.t1.c1, 1)
├─IndexLookUp_17 1999.00 root
│ ├─Selection_16 1999.00 cop gt(test.t1.c1, 1)
│ │ └─IndexScan_14 1999.00 cop table:t1, index:c2, range:[NULL,+inf], keep order:true
│ └─TableScan_15 1998.00 cop table:t1, keep order:false
│ └─TableScan_15 1999.00 cop table:t1, keep order:false
└─IndexLookUp_21 1985.00 root
├─IndexScan_19 1985.00 cop table:t2, index:c1, range:[NULL,+inf], keep order:true
└─TableScan_20 1985.00 cop table:t2, keep order:false
Expand Down
4 changes: 2 additions & 2 deletions plan/cbo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -496,8 +496,8 @@ func (s *testAnalyzeSuite) TestOutdatedAnalyze(c *C) {
c.Assert(h.Update(dom.InfoSchema()), IsNil)
statistics.RatioOfPseudoEstimate = 10.0
testKit.MustQuery("explain select * from t where a <= 5 and b <= 5").Check(testkit.Rows(
"TableReader_7 28.80 root data:Selection_6",
"└─Selection_6 28.80 cop le(test.t.a, 5), le(test.t.b, 5)",
"TableReader_7 35.91 root data:Selection_6",
"└─Selection_6 35.91 cop le(test.t.a, 5), le(test.t.b, 5)",
" └─TableScan_5 80.00 cop table:t, range:[-inf,+inf], keep order:false",
))
statistics.RatioOfPseudoEstimate = 0.7
Expand Down
6 changes: 3 additions & 3 deletions statistics/boostrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,15 @@ func initStatsMeta4Chunk(is infoschema.InfoSchema, tables statsCache, iter *chun
TableID: tableInfo.ID,
HaveTblID: true,
Count: row.GetInt64(3),
ModifyCount: row.GetInt64(2),
Columns: make(map[int64]*Column, len(tableInfo.Columns)),
Indices: make(map[int64]*Index, len(tableInfo.Indices)),
colName2Idx: make(map[string]int64, len(tableInfo.Columns)),
colName2ID: make(map[string]int64, len(tableInfo.Columns)),
}
tbl := &Table{
HistColl: newHistColl,
ModifyCount: row.GetInt64(2),
Version: row.GetUint64(0),
HistColl: newHistColl,
Version: row.GetUint64(0),
}
tables[tableID] = tbl
}
Expand Down
2 changes: 1 addition & 1 deletion statistics/ddl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ func (s *testStatsCacheSuite) TestDDLHistogram(c *C) {
c.Assert(count, Equals, float64(2))
count, err = statsTbl.ColumnEqualRowCount(sc, types.NewIntDatum(1), tableInfo.Columns[3].ID)
c.Assert(err, IsNil)
c.Assert(count, Equals, float64(0))
c.Assert(count, Equals, float64(2))

testKit.MustExec("alter table t add column c4 datetime NOT NULL default CURRENT_TIMESTAMP")
err = h.HandleDDLEvent(<-h.DDLEventCh())
Expand Down
14 changes: 7 additions & 7 deletions statistics/dump.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,15 +122,15 @@ func (h *Handle) LoadStatsFromJSON(is infoschema.InfoSchema, jsonTbl *JSONTable)
// LoadStatsFromJSONToTable loads statistics from a JSONTable and returns the resulting statistics Table.
func (h *Handle) LoadStatsFromJSONToTable(tableInfo *model.TableInfo, jsonTbl *JSONTable) (*Table, error) {
newHistColl := HistColl{
TableID: tableInfo.ID,
HaveTblID: true,
Count: jsonTbl.Count,
Columns: make(map[int64]*Column, len(jsonTbl.Columns)),
Indices: make(map[int64]*Index, len(jsonTbl.Indices)),
TableID: tableInfo.ID,
HaveTblID: true,
Count: jsonTbl.Count,
ModifyCount: jsonTbl.ModifyCount,
Columns: make(map[int64]*Column, len(jsonTbl.Columns)),
Indices: make(map[int64]*Index, len(jsonTbl.Indices)),
}
tbl := &Table{
HistColl: newHistColl,
ModifyCount: jsonTbl.ModifyCount,
HistColl: newHistColl,
}
for id, jsonIdx := range jsonTbl.Indices {
for _, idxInfo := range tableInfo.Indices {
Expand Down
4 changes: 2 additions & 2 deletions statistics/feedback.go
Original file line number Diff line number Diff line change
Expand Up @@ -714,11 +714,11 @@ func (q *QueryFeedback) recalculateExpectCount(h *Handle) error {
expected := 0.0
if isIndex {
idx := t.Indices[id]
expected, err = idx.getRowCount(sc, ranges)
expected, err = idx.getRowCount(sc, ranges, t.ModifyCount)
expected *= idx.getIncreaseFactor(t.Count)
} else {
c := t.Columns[id]
expected, err = c.getColumnRowCount(sc, ranges)
expected, err = c.getColumnRowCount(sc, ranges, t.ModifyCount)
expected *= c.getIncreaseFactor(t.Count)
}
if err != nil {
Expand Down
36 changes: 29 additions & 7 deletions statistics/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -669,6 +669,15 @@ func (hg *Histogram) AvgCountPerValue(totalCount int64) float64 {
return float64(totalCount) / curNDV
}

// outOfRange reports whether val falls outside the span of values held in
// hg.Bounds: it is true when the first row of Bounds compares greater than
// val, or the last row compares less than val.
//
// A histogram with nil or empty Bounds covers nothing, so every value is
// reported as out of range. The empty check also prevents indexing row -1.
func (hg *Histogram) outOfRange(val types.Datum) bool {
	if hg.Bounds == nil || hg.Bounds.NumRows() == 0 {
		return true
	}
	// Named lastIdx rather than len so the builtin is not shadowed.
	lastIdx := hg.Bounds.NumRows() - 1
	return chunk.Compare(hg.Bounds.GetRow(0), 0, &val) > 0 ||
		chunk.Compare(hg.Bounds.GetRow(lastIdx), 0, &val) < 0
}

// ErrorRate is the error rate of estimate row count by bucket and cm sketch.
type ErrorRate struct {
ErrorTotal float64
Expand Down Expand Up @@ -716,19 +725,22 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum) (f
if val.IsNull() {
return float64(c.NullCount), nil
}
if c.CMSketch != nil {
count, err := c.CMSketch.queryValue(sc, val)
return float64(count), errors.Trace(err)
}
// All the values are null.
if c.Histogram.Bounds == nil {
return 0.0, nil
}
if c.NDV > 0 && c.outOfRange(val) {
return c.totalRowCount() / (float64(c.NDV)), nil
}
if c.CMSketch != nil {
count, err := c.CMSketch.queryValue(sc, val)
return float64(count), errors.Trace(err)
}
return c.Histogram.equalRowCount(val), nil
}

// getColumnRowCount estimates the row count by a slice of Range.
func (c *Column) getColumnRowCount(sc *stmtctx.StatementContext, ranges []*ranger.Range) (float64, error) {
func (c *Column) getColumnRowCount(sc *stmtctx.StatementContext, ranges []*ranger.Range, modifyCount int64) (float64, error) {
var rowCount float64
for _, rg := range ranges {
cmp, err := rg.LowVal[0].CompareDatum(sc, &rg.HighVal[0])
Expand All @@ -749,6 +761,9 @@ func (c *Column) getColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
}
// the interval case.
cnt := c.betweenRowCount(rg.LowVal[0], rg.HighVal[0])
if c.outOfRange(rg.LowVal[0]) || c.outOfRange(rg.HighVal[0]) {
cnt += float64(modifyCount) / outOfRangeBetweenRate
}
if rg.LowExclude {
lowCnt, err := c.equalRowCount(sc, rg.LowVal[0])
if err != nil {
Expand Down Expand Up @@ -787,13 +802,17 @@ func (idx *Index) String() string {
}

// equalRowCount estimates how many rows have an encoded index key equal to b.
//
// The estimate is chosen in this order:
//  1. If the index has a positive NDV and the key is out of range according
//     to idx.outOfRange, return idx.totalRowCount() divided by NDV.
//  2. Otherwise, if a CM sketch is present, return its count for b.
//  3. Otherwise, fall back to the histogram's equalRowCount.
//
// sc is accepted for signature compatibility and is not used here.
func (idx *Index) equalRowCount(sc *stmtctx.StatementContext, b []byte) float64 {
	// Build the datum once; both the range check and the histogram lookup use it.
	val := types.NewBytesDatum(b)
	if idx.NDV > 0 && idx.outOfRange(val) {
		return idx.totalRowCount() / float64(idx.NDV)
	}
	if idx.CMSketch != nil {
		return float64(idx.CMSketch.QueryBytes(b))
	}
	return idx.Histogram.equalRowCount(val)
}

func (idx *Index) getRowCount(sc *stmtctx.StatementContext, indexRanges []*ranger.Range) (float64, error) {
func (idx *Index) getRowCount(sc *stmtctx.StatementContext, indexRanges []*ranger.Range, modifyCount int64) (float64, error) {
totalCount := float64(0)
for _, indexRange := range indexRanges {
lb, err := codec.EncodeKey(sc, nil, indexRange.LowVal...)
Expand All @@ -820,6 +839,9 @@ func (idx *Index) getRowCount(sc *stmtctx.StatementContext, indexRanges []*range
l := types.NewBytesDatum(lb)
r := types.NewBytesDatum(rb)
totalCount += idx.betweenRowCount(l, r)
if idx.outOfRange(l) || idx.outOfRange(r) {
totalCount += float64(modifyCount) / outOfRangeBetweenRate
}
}
if totalCount > idx.totalRowCount() {
totalCount = idx.totalRowCount()
Expand Down
58 changes: 56 additions & 2 deletions statistics/selectivity_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"github.com/pingcap/tidb/statistics"
"github.com/pingcap/tidb/types"
"github.com/pingcap/tidb/util/codec"
"github.com/pingcap/tidb/util/ranger"
"github.com/pingcap/tidb/util/testkit"
)

Expand Down Expand Up @@ -157,7 +158,7 @@ func (s *testSelectivitySuite) TestSelectivity(c *C) {
},
{
exprs: "a >= 1 and b > 1 and a < 2",
selectivity: 0.01783264746,
selectivity: 0.01817558299,
},
{
exprs: "a >= 1 and c > 1 and a < 2",
Expand All @@ -173,7 +174,7 @@ func (s *testSelectivitySuite) TestSelectivity(c *C) {
},
{
exprs: "b > 1",
selectivity: 0.96296296296,
selectivity: 0.98148148148,
},
{
exprs: "a > 1 and b < 2 and c > 3 and d < 4 and e > 5",
Expand Down Expand Up @@ -238,6 +239,59 @@ func (s *testSelectivitySuite) TestDiscreteDistribution(c *C) {
"└─IndexScan_8 0.00 cop table:t, index:a, b, range:[\"tw\" -inf,\"tw\" 0), keep order:false"))
}

// getRange builds a one-element slice holding a single-column range whose
// low value is the int datum for start and whose high value is the int datum
// for end. LowExclude and HighExclude are left at their zero values.
func getRange(start, end int64) []*ranger.Range {
	low := types.NewIntDatum(start)
	high := types.NewIntDatum(end)
	return []*ranger.Range{{
		LowVal:  []types.Datum{low},
		HighVal: []types.Datum{high},
	}}
}

// TestEstimationForUnknownValues checks row-count estimates for ranges that
// touch values inserted after the last ANALYZE.
//
// Rows (0,0)..(9,9) are inserted and analyzed; rows (10,10)..(19,19) are then
// inserted and only their stats delta is dumped before the stats cache is
// refreshed. The test then asserts the column and index estimates for a few
// ranges built with getRange.
func (s *testSelectivitySuite) TestEstimationForUnknownValues(c *C) {
	tk := testkit.NewTestKit(c, s.store)
	for _, sql := range []string{
		"use test",
		"drop table if exists t",
		"create table t(a int, b int, key idx(a, b))",
		"analyze table t",
	} {
		tk.MustExec(sql)
	}

	// insertTen inserts the ten rows (base, base) .. (base+9, base+9).
	insertTen := func(base int) {
		for v := base; v < base+10; v++ {
			tk.MustExec(fmt.Sprintf("insert into t values (%d, %d)", v, v))
		}
	}

	insertTen(0)
	handle := s.dom.StatsHandle()
	handle.DumpStatsDeltaToKV(statistics.DumpAll)
	tk.MustExec("analyze table t")

	// The second batch is deliberately not followed by another ANALYZE.
	insertTen(10)
	handle.DumpStatsDeltaToKV(statistics.DumpAll)
	c.Assert(handle.Update(s.dom.InfoSchema()), IsNil)

	tbl, err := s.dom.InfoSchema().TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
	c.Assert(err, IsNil)
	statsTbl := handle.GetTableStats(tbl.Meta())

	sc := &stmtctx.StatementContext{}

	// estimate pairs a [low, high] range with the expected row count.
	type estimate struct {
		low, high int64
		want      float64
	}

	colID := tbl.Meta().Columns[0].ID
	for _, tc := range []estimate{
		{30, 30, 2.0},
		{9, 30, 4.2},
		{9, math.MaxInt64, 4.2},
	} {
		got, err := statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(tc.low, tc.high))
		c.Assert(err, IsNil)
		c.Assert(got, Equals, tc.want)
	}

	idxID := tbl.Meta().Indices[0].ID
	for _, tc := range []estimate{
		{30, 30, 0.2},
		{9, 30, 2.2},
	} {
		got, err := statsTbl.GetRowCountByIndexRanges(sc, idxID, getRange(tc.low, tc.high))
		c.Assert(err, IsNil)
		c.Assert(got, Equals, tc.want)
	}
}

func BenchmarkSelectivity(b *testing.B) {
c := &C{}
s := &testSelectivitySuite{}
Expand Down
Loading