pingcap · coocood · Jul 29, 2018 · Jul 27, 2018
diff --git a/cmd/explaintest/r/explain_complex_stats.result b/cmd/explaintest/r/explain_complex_stats.result
@@ -158,11 +158,11 @@ Projection_5	39.28	root	test.st.cm, test.st.p1, test.st.p2, test.st.p3, test.st.
       └─TableScan_14	160.23	cop	table:st, keep order:false
 explain select dt.id as id, dt.aid as aid, dt.pt as pt, dt.dic as dic, dt.cm as cm, rr.gid as gid, rr.acd as acd, rr.t as t,dt.p1 as p1, dt.p2 as p2, dt.p3 as p3, dt.p4 as p4, dt.p5 as p5, dt.p6_md5 as p6, dt.p7_md5 as p7 from dt dt join rr rr on (rr.pt = 'ios' and rr.t > 1478185592 and dt.aid = rr.aid and dt.dic = rr.dic) where dt.pt = 'ios' and dt.t > 1478185592 and dt.bm = 0 limit 2000;
 id	count	task	operator info
-Projection_9	428.32	root	dt.id, dt.aid, dt.pt, dt.dic, dt.cm, rr.gid, rr.acd, rr.t, dt.p1, dt.p2, dt.p3, dt.p4, dt.p5, dt.p6_md5, dt.p7_md5
-└─Limit_12	428.32	root	offset:0, count:2000
-  └─IndexJoin_18	428.32	root	inner join, inner:IndexLookUp_17, outer key:dt.aid, dt.dic, inner key:rr.aid, rr.dic
-    ├─TableReader_42	428.32	root	data:Selection_41
-    │ └─Selection_41	428.32	cop	eq(dt.pt, "ios"), gt(dt.t, 1478185592), eq(dt.bm, 0)
+Projection_9	428.55	root	dt.id, dt.aid, dt.pt, dt.dic, dt.cm, rr.gid, rr.acd, rr.t, dt.p1, dt.p2, dt.p3, dt.p4, dt.p5, dt.p6_md5, dt.p7_md5
+└─Limit_12	428.55	root	offset:0, count:2000
+  └─IndexJoin_18	428.55	root	inner join, inner:IndexLookUp_17, outer key:dt.aid, dt.dic, inner key:rr.aid, rr.dic
+    ├─TableReader_42	428.55	root	data:Selection_41
+    │ └─Selection_41	428.55	cop	eq(dt.pt, "ios"), gt(dt.t, 1478185592), eq(dt.bm, 0)
     │   └─TableScan_40	2000.00	cop	table:dt, range:[0,+inf], keep order:false
     └─IndexLookUp_17	970.00	root	
       ├─IndexScan_14	1.00	cop	table:rr, index:aid, dic, range: decided by [dt.aid dt.dic], keep order:false

diff --git a/cmd/explaintest/r/explain_easy_stats.result b/cmd/explaintest/r/explain_easy_stats.result
@@ -47,10 +47,10 @@ explain select * from t1 left join t2 on t1.c2 = t2.c1 where t1.c1 > 1;
 id	count	task	operator info
 Projection_6	2481.25	root	test.t1.c1, test.t1.c2, test.t1.c3, test.t2.c1, test.t2.c2
 └─MergeJoin_7	2481.25	root	left outer join, left key:test.t1.c2, right key:test.t2.c1
-  ├─IndexLookUp_17	1998.00	root	
-  │ ├─Selection_16	1998.00	cop	gt(test.t1.c1, 1)
+  ├─IndexLookUp_17	1999.00	root	
+  │ ├─Selection_16	1999.00	cop	gt(test.t1.c1, 1)
   │ │ └─IndexScan_14	1999.00	cop	table:t1, index:c2, range:[NULL,+inf], keep order:true
-  │ └─TableScan_15	1998.00	cop	table:t1, keep order:false
+  │ └─TableScan_15	1999.00	cop	table:t1, keep order:false
   └─IndexLookUp_21	1985.00	root	
     ├─IndexScan_19	1985.00	cop	table:t2, index:c1, range:[NULL,+inf], keep order:true
     └─TableScan_20	1985.00	cop	table:t2, keep order:false

diff --git a/plan/cbo_test.go b/plan/cbo_test.go
@@ -496,8 +496,8 @@ func (s *testAnalyzeSuite) TestOutdatedAnalyze(c *C) {
 	c.Assert(h.Update(dom.InfoSchema()), IsNil)
 	statistics.RatioOfPseudoEstimate = 10.0
 	testKit.MustQuery("explain select * from t where a <= 5 and b <= 5").Check(testkit.Rows(
-		"TableReader_7 28.80 root data:Selection_6",
-		"└─Selection_6 28.80 cop le(test.t.a, 5), le(test.t.b, 5)",
+		"TableReader_7 35.91 root data:Selection_6",
+		"└─Selection_6 35.91 cop le(test.t.a, 5), le(test.t.b, 5)",
 		"  └─TableScan_5 80.00 cop table:t, range:[-inf,+inf], keep order:false",
 	))
 	statistics.RatioOfPseudoEstimate = 0.7

diff --git a/statistics/boostrap.go b/statistics/boostrap.go
@@ -40,15 +40,15 @@ func initStatsMeta4Chunk(is infoschema.InfoSchema, tables statsCache, iter *chun
 			TableID:     tableInfo.ID,
 			HaveTblID:   true,
 			Count:       row.GetInt64(3),
+			ModifyCount: row.GetInt64(2),
 			Columns:     make(map[int64]*Column, len(tableInfo.Columns)),
 			Indices:     make(map[int64]*Index, len(tableInfo.Indices)),
 			colName2Idx: make(map[string]int64, len(tableInfo.Columns)),
 			colName2ID:  make(map[string]int64, len(tableInfo.Columns)),
 		}
 		tbl := &Table{
-			HistColl:    newHistColl,
-			ModifyCount: row.GetInt64(2),
-			Version:     row.GetUint64(0),
+			HistColl: newHistColl,
+			Version:  row.GetUint64(0),
 		}
 		tables[tableID] = tbl
 	}

diff --git a/statistics/ddl_test.go b/statistics/ddl_test.go
@@ -126,7 +126,7 @@ func (s *testStatsCacheSuite) TestDDLHistogram(c *C) {
 	c.Assert(count, Equals, float64(2))
 	count, err = statsTbl.ColumnEqualRowCount(sc, types.NewIntDatum(1), tableInfo.Columns[3].ID)
 	c.Assert(err, IsNil)
-	c.Assert(count, Equals, float64(0))
+	c.Assert(count, Equals, float64(2))
 
 	testKit.MustExec("alter table t add column c4 datetime NOT NULL default CURRENT_TIMESTAMP")
 	err = h.HandleDDLEvent(<-h.DDLEventCh())

diff --git a/statistics/dump.go b/statistics/dump.go
@@ -122,15 +122,15 @@ func (h *Handle) LoadStatsFromJSON(is infoschema.InfoSchema, jsonTbl *JSONTable)
 // LoadStatsFromJSONToTable load statistic from JSONTable and return the Table of statistic.
 func (h *Handle) LoadStatsFromJSONToTable(tableInfo *model.TableInfo, jsonTbl *JSONTable) (*Table, error) {
 	newHistColl := HistColl{
-		TableID:   tableInfo.ID,
-		HaveTblID: true,
-		Count:     jsonTbl.Count,
-		Columns:   make(map[int64]*Column, len(jsonTbl.Columns)),
-		Indices:   make(map[int64]*Index, len(jsonTbl.Indices)),
+		TableID:     tableInfo.ID,
+		HaveTblID:   true,
+		Count:       jsonTbl.Count,
+		ModifyCount: jsonTbl.ModifyCount,
+		Columns:     make(map[int64]*Column, len(jsonTbl.Columns)),
+		Indices:     make(map[int64]*Index, len(jsonTbl.Indices)),
 	}
 	tbl := &Table{
-		HistColl:    newHistColl,
-		ModifyCount: jsonTbl.ModifyCount,
+		HistColl: newHistColl,
 	}
 	for id, jsonIdx := range jsonTbl.Indices {
 		for _, idxInfo := range tableInfo.Indices {

diff --git a/statistics/feedback.go b/statistics/feedback.go
@@ -714,11 +714,11 @@ func (q *QueryFeedback) recalculateExpectCount(h *Handle) error {
 	expected := 0.0
 	if isIndex {
 		idx := t.Indices[id]
-		expected, err = idx.getRowCount(sc, ranges)
+		expected, err = idx.getRowCount(sc, ranges, t.ModifyCount)
 		expected *= idx.getIncreaseFactor(t.Count)
 	} else {
 		c := t.Columns[id]
-		expected, err = c.getColumnRowCount(sc, ranges)
+		expected, err = c.getColumnRowCount(sc, ranges, t.ModifyCount)
 		expected *= c.getIncreaseFactor(t.Count)
 	}
 	if err != nil {

diff --git a/statistics/histogram.go b/statistics/histogram.go
@@ -669,6 +669,15 @@ func (hg *Histogram) AvgCountPerValue(totalCount int64) float64 {
 	return float64(totalCount) / curNDV
 }
 
+// outOfRange reports whether val is below the first bound or above the last one (true when there are no bounds).
+func (hg *Histogram) outOfRange(val types.Datum) bool {
+	if hg.Bounds == nil || hg.Bounds.NumRows() == 0 {
+		return true
+	}
+	first, last := hg.Bounds.GetRow(0), hg.Bounds.GetRow(hg.Bounds.NumRows()-1)
+	return chunk.Compare(first, 0, &val) > 0 || chunk.Compare(last, 0, &val) < 0
+}
+
 // ErrorRate is the error rate of estimate row count by bucket and cm sketch.
 type ErrorRate struct {
 	ErrorTotal float64
@@ -716,19 +725,22 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum) (f
 	if val.IsNull() {
 		return float64(c.NullCount), nil
 	}
-	if c.CMSketch != nil {
-		count, err := c.CMSketch.queryValue(sc, val)
-		return float64(count), errors.Trace(err)
-	}
 	// all the values is null
 	if c.Histogram.Bounds == nil {
 		return 0.0, nil
 	}
+	if c.NDV > 0 && c.outOfRange(val) {
+		return c.totalRowCount() / (float64(c.NDV)), nil
+	}
+	if c.CMSketch != nil {
+		count, err := c.CMSketch.queryValue(sc, val)
+		return float64(count), errors.Trace(err)
+	}
 	return c.Histogram.equalRowCount(val), nil
 }
 
 // getColumnRowCount estimates the row count by a slice of Range.
-func (c *Column) getColumnRowCount(sc *stmtctx.StatementContext, ranges []*ranger.Range) (float64, error) {
+func (c *Column) getColumnRowCount(sc *stmtctx.StatementContext, ranges []*ranger.Range, modifyCount int64) (float64, error) {
 	var rowCount float64
 	for _, rg := range ranges {
 		cmp, err := rg.LowVal[0].CompareDatum(sc, &rg.HighVal[0])
@@ -749,6 +761,9 @@ func (c *Column) getColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
 		}
 		// the interval case.
 		cnt := c.betweenRowCount(rg.LowVal[0], rg.HighVal[0])
+		if c.outOfRange(rg.LowVal[0]) || c.outOfRange(rg.HighVal[0]) {
+			cnt += float64(modifyCount) / outOfRangeBetweenRate
+		}
 		if rg.LowExclude {
 			lowCnt, err := c.equalRowCount(sc, rg.LowVal[0])
 			if err != nil {
@@ -787,13 +802,17 @@ func (idx *Index) String() string {
 }
 
 func (idx *Index) equalRowCount(sc *stmtctx.StatementContext, b []byte) float64 {
+	val := types.NewBytesDatum(b)
+	if idx.NDV > 0 && idx.outOfRange(val) {
+		return idx.totalRowCount() / (float64(idx.NDV))
+	}
 	if idx.CMSketch != nil {
 		return float64(idx.CMSketch.QueryBytes(b))
 	}
-	return idx.Histogram.equalRowCount(types.NewBytesDatum(b))
+	return idx.Histogram.equalRowCount(val)
 }
 
-func (idx *Index) getRowCount(sc *stmtctx.StatementContext, indexRanges []*ranger.Range) (float64, error) {
+func (idx *Index) getRowCount(sc *stmtctx.StatementContext, indexRanges []*ranger.Range, modifyCount int64) (float64, error) {
 	totalCount := float64(0)
 	for _, indexRange := range indexRanges {
 		lb, err := codec.EncodeKey(sc, nil, indexRange.LowVal...)
@@ -820,6 +839,9 @@ func (idx *Index) getRowCount(sc *stmtctx.StatementContext, indexRanges []*range
 		l := types.NewBytesDatum(lb)
 		r := types.NewBytesDatum(rb)
 		totalCount += idx.betweenRowCount(l, r)
+		if idx.outOfRange(l) || idx.outOfRange(r) {
+			totalCount += float64(modifyCount) / outOfRangeBetweenRate
+		}
 	}
 	if totalCount > idx.totalRowCount() {
 		totalCount = idx.totalRowCount()

diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go
@@ -33,6 +33,7 @@ import (
 	"github.com/pingcap/tidb/statistics"
 	"github.com/pingcap/tidb/types"
 	"github.com/pingcap/tidb/util/codec"
+	"github.com/pingcap/tidb/util/ranger"
 	"github.com/pingcap/tidb/util/testkit"
 )
 
@@ -157,7 +158,7 @@ func (s *testSelectivitySuite) TestSelectivity(c *C) {
 		},
 		{
 			exprs:       "a >= 1 and b > 1 and a < 2",
-			selectivity: 0.01783264746,
+			selectivity: 0.01817558299,
 		},
 		{
 			exprs:       "a >= 1 and c > 1 and a < 2",
@@ -173,7 +174,7 @@ func (s *testSelectivitySuite) TestSelectivity(c *C) {
 		},
 		{
 			exprs:       "b > 1",
-			selectivity: 0.96296296296,
+			selectivity: 0.98148148148,
 		},
 		{
 			exprs:       "a > 1 and b < 2 and c > 3 and d < 4 and e > 5",
@@ -238,6 +239,59 @@ func (s *testSelectivitySuite) TestDiscreteDistribution(c *C) {
 		"└─IndexScan_8 0.00 cop table:t, index:a, b, range:[\"tw\" -inf,\"tw\" 0), keep order:false"))
 }
 
+// getRange wraps start and end into a single-element slice of ranges with one low and one high int datum.
+func getRange(start, end int64) []*ranger.Range {
+	low := []types.Datum{types.NewIntDatum(start)}
+	high := []types.Datum{types.NewIntDatum(end)}
+	// Only LowVal and HighVal are populated; every other Range field keeps its zero value.
+	return []*ranger.Range{{LowVal: low, HighVal: high}}
+}
+
+func (s *testSelectivitySuite) TestEstimationForUnknownValues(c *C) {
+	testKit := testkit.NewTestKit(c, s.store)
+	testKit.MustExec("use test")
+	testKit.MustExec("drop table if exists t")
+	testKit.MustExec("create table t(a int, b int, key idx(a, b))")
+	testKit.MustExec("analyze table t")
+	for i := 0; i < 10; i++ {
+		testKit.MustExec(fmt.Sprintf("insert into t values (%d, %d)", i, i))
+	}
+	h := s.dom.StatsHandle()
+	h.DumpStatsDeltaToKV(statistics.DumpAll)
+	testKit.MustExec("analyze table t")
+	for i := 0; i < 10; i++ {
+		testKit.MustExec(fmt.Sprintf("insert into t values (%d, %d)", i+10, i+10))
+	}
+	h.DumpStatsDeltaToKV(statistics.DumpAll)
+	c.Assert(h.Update(s.dom.InfoSchema()), IsNil)
+	table, err := s.dom.InfoSchema().TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
+	c.Assert(err, IsNil)
+	statsTbl := h.GetTableStats(table.Meta())
+
+	sc := &stmtctx.StatementContext{}
+	colID := table.Meta().Columns[0].ID
+	count, err := statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(30, 30))
+	c.Assert(err, IsNil)
+	c.Assert(count, Equals, 2.0)
+
+	count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, 30))
+	c.Assert(err, IsNil)
+	c.Assert(count, Equals, 4.2)
+
+	count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, math.MaxInt64))
+	c.Assert(err, IsNil)
+	c.Assert(count, Equals, 4.2)
+
+	idxID := table.Meta().Indices[0].ID
+	count, err = statsTbl.GetRowCountByIndexRanges(sc, idxID, getRange(30, 30))
+	c.Assert(err, IsNil)
+	c.Assert(count, Equals, 0.2)
+
+	count, err = statsTbl.GetRowCountByIndexRanges(sc, idxID, getRange(9, 30))
+	c.Assert(err, IsNil)
+	c.Assert(count, Equals, 2.2)
+}
+
 func BenchmarkSelectivity(b *testing.B) {
 	c := &C{}
 	s := &testSelectivitySuite{}