Skip to content

Commit

Permalink
statistics: refine index/column stats methods and remove statistics.C…
Browse files Browse the repository at this point in the history
…olumn.Count (#44406)

ref #42160, close #44404
  • Loading branch information
xuyifangreeneyes authored Jun 5, 2023
1 parent b768eb5 commit 546e491
Show file tree
Hide file tree
Showing 15 changed files with 70 additions and 143 deletions.
5 changes: 2 additions & 3 deletions planner/core/integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6871,8 +6871,8 @@ func TestIssue32632(t *testing.T) {
"`S_ACCTBAL` decimal(15,2) NOT NULL," +
"`S_COMMENT` varchar(101) NOT NULL," +
"PRIMARY KEY (`S_SUPPKEY`) /*T![clustered_index] CLUSTERED */)")
tk.MustExec("analyze table partsupp;")
tk.MustExec("analyze table supplier;")
h := dom.StatsHandle()
require.NoError(t, h.HandleDDLEvent(<-h.DDLEventCh()))
tk.MustExec("set @@tidb_enforce_mpp = 1")

tbl1, err := dom.InfoSchema().TableByName(model.CIStr{O: "test", L: "test"}, model.CIStr{O: "partsupp", L: "partsupp"})
Expand All @@ -6883,7 +6883,6 @@ func TestIssue32632(t *testing.T) {
tbl1.Meta().TiFlashReplica = &model.TiFlashReplicaInfo{Count: 1, Available: true}
tbl2.Meta().TiFlashReplica = &model.TiFlashReplicaInfo{Count: 1, Available: true}

h := dom.StatsHandle()
statsTbl1 := h.GetTableStats(tbl1.Meta())
statsTbl1.Count = 800000
statsTbl2 := h.GetTableStats(tbl2.Meta())
Expand Down
30 changes: 27 additions & 3 deletions planner/core/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,13 +162,37 @@ func (p *baseLogicalPlan) DeriveStats(childStats []*property.StatsInfo, selfSche
return profile, nil
}

// getTotalRowCount reports the total row count associated with colHist.
//
// When colHist itself is fully loaded, its own TotalRowCount is used. Otherwise
// the function falls back to any fully loaded index (checked first) or column
// of statsTbl whose LastUpdateVersion equals colHist's, and uses that entry's
// TotalRowCount instead. It returns 0 when no such entry exists.
//
// NOTE(review): the fallback presumably relies on stats sharing a
// LastUpdateVersion having been collected by the same analyze — confirm.
func getTotalRowCount(statsTbl *statistics.Table, colHist *statistics.Column) int64 {
	if !colHist.IsFullLoad() {
		wantVersion := colHist.LastUpdateVersion
		// Indices are consulted before columns, mirroring the lookup priority.
		for _, idxStats := range statsTbl.Indices {
			if idxStats.IsFullLoad() && idxStats.LastUpdateVersion == wantVersion {
				return int64(idxStats.TotalRowCount())
			}
		}
		for _, colStats := range statsTbl.Columns {
			if colStats.IsFullLoad() && colStats.LastUpdateVersion == wantVersion {
				return int64(colStats.TotalRowCount())
			}
		}
		// Nothing loaded shares the version, so the count is unknown.
		return 0
	}
	return int64(colHist.TotalRowCount())
}

// getColumnNDV computes estimated NDV of specified column using the original
// histogram of `DataSource` which is retrieved from storage(not the derived one).
func (ds *DataSource) getColumnNDV(colID int64) (ndv float64) {
hist, ok := ds.statisticTable.Columns[colID]
if ok && hist.Count > 0 {
factor := float64(ds.statisticTable.Count) / float64(hist.Count)
ndv = float64(hist.Histogram.NDV) * factor
if ok && hist.IsStatsInitialized() {
ndv = float64(hist.Histogram.NDV)
// TODO: a better way to get the total row count derived from the last analyze.
analyzeCount := getTotalRowCount(ds.statisticTable, hist)
if analyzeCount > 0 {
factor := float64(ds.statisticTable.Count) / float64(analyzeCount)
ndv *= factor
}
} else {
ndv = float64(ds.statisticTable.Count) * distinctFactor
}
Expand Down
4 changes: 2 additions & 2 deletions planner/core/testdata/integration_suite_out.json
Original file line number Diff line number Diff line change
Expand Up @@ -9592,8 +9592,8 @@
" └─HashJoin 12500.00 mpp[tiflash] inner join, equal:[eq(test.supplier.s_suppkey, test.partsupp.ps_suppkey)]",
" ├─ExchangeReceiver(Build) 10000.00 mpp[tiflash] ",
" │ └─ExchangeSender 10000.00 mpp[tiflash] ExchangeType: Broadcast",
" │ └─TableFullScan 10000.00 mpp[tiflash] table:supplier keep order:false",
" └─TableFullScan(Probe) 800000.00 mpp[tiflash] table:partsupp keep order:false"
" │ └─TableFullScan 10000.00 mpp[tiflash] table:supplier keep order:false, stats:pseudo",
" └─TableFullScan(Probe) 800000.00 mpp[tiflash] table:partsupp keep order:false, stats:pseudo"
]
}
]
Expand Down
21 changes: 20 additions & 1 deletion statistics/column.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ type Column struct {
TopN *TopN
FMSketch *FMSketch
PhysicalID int64
Count int64
Info *model.ColumnInfo
IsHandle bool
ErrorRate
Expand Down Expand Up @@ -462,3 +461,23 @@ func (s StatsLoadedStatus) StatusToString() string {
}
return "unknown"
}

// IsAnalyzed reports whether the column is analyzed, which is decided solely
// by its StatsVer being different from Version0.
// Every column for which IsAnalyzed is true also satisfies StatsAvailable, so
// the analyzed columns form a subset of the StatsAvailable columns.
func (c *Column) IsAnalyzed() bool {
return c.StatsVer != Version0
}

// StatsAvailable reports whether stats have been collected for the column.
//
// Notes:
//  1. Only the existence of collected stats is considered; the stats loaded
//     status plays no role here.
//  2. The result is used to decide StatsLoadedStatus.statsInitialized when the
//     column stats are read from storage.
//  3. It is true in two cases:
//     a. IsAnalyzed is true.
//     b. The column is newly added/modified and its stats were generated from
//     the default value.
func (c *Column) StatsAvailable() bool {
	// The usual case: analyze sets StatsVer to Version1/Version2.
	if c.IsAnalyzed() {
		return true
	}
	// Stats generated for an added/modified column come from the default value
	// and leave StatsVer unset, so a non-zero NDV or NullCount is the only
	// sign that they exist.
	return c.NDV > 0 || c.NullCount > 0
}
39 changes: 1 addition & 38 deletions statistics/handle/bootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,23 +138,12 @@ func (h *Handle) initStatsHistograms4Chunk(is infoschema.InfoSchema, cache *stat
if colInfo == nil {
continue
}
var topnCount int64
// If this is stats of the Version2, we need to consider the topn's count as well.
// See the comments of Version2 for more details.
if statsVer >= statistics.Version2 {
var err error
topnCount, err = h.initTopNCountSum(tblID, id)
if err != nil {
terror.Log(err)
}
}
hist := statistics.NewHistogram(id, ndv, nullCount, version, &colInfo.FieldType, 0, totColSize)
hist.Correlation = row.GetFloat64(9)
col := &statistics.Column{
Histogram: *hist,
PhysicalID: table.PhysicalID,
Info: colInfo,
Count: nullCount + topnCount,
IsHandle: tbl.Meta().PKIsHandle && mysql.HasPriKeyFlag(colInfo.GetFlag()),
Flag: row.GetInt64(10),
StatsVer: statsVer,
Expand Down Expand Up @@ -306,7 +295,6 @@ func (h *Handle) initStatsBuckets4Chunk(cache *statsCache, iter *chunk.Iterator4
if !ok {
continue
}
column.Count += row.GetInt64(3)
if !mysql.HasPriKeyFlag(column.Info.GetFlag()) {
continue
}
Expand Down Expand Up @@ -334,31 +322,6 @@ func (h *Handle) initStatsBuckets4Chunk(cache *statsCache, iter *chunk.Iterator4
}
}

// initTopNCountSum returns the sum of `count` over all rows of
// mysql.stats_top_n that belong to the given column (is_index = 0) of the
// given table.
//
// Before stats ver 2, histogram represents all data in this column.
// In stats ver 2, histogram + TopN represent all data in this column,
// so the TopN total count has to be added on top of the histogram count.
//
// It returns 0 with a nil error when the column has no TopN rows. Any error
// from executing the query, fetching the result or converting the decimal is
// returned as is.
func (h *Handle) initTopNCountSum(tableID, colID int64) (int64, error) {
	ctx := kv.WithInternalSourceType(context.Background(), kv.InternalTxnStats)
	selSQL := "select sum(count) from mysql.stats_top_n where table_id = %? and is_index = 0 and hist_id = %?"
	rs, err := h.initStatsCtx.(sqlexec.SQLExecutor).ExecuteInternal(ctx, selSQL, tableID, colID)
	if rs != nil {
		// Close the record set even when the execution also reported an error.
		defer terror.Call(rs.Close)
	}
	if err != nil {
		return 0, err
	}
	if rs == nil {
		// No record set to read from; treat it as "no TopN rows" instead of
		// dereferencing a nil value below.
		return 0, nil
	}
	req := rs.NewChunk(nil)
	iter := chunk.NewIterator4Chunk(req)
	if err = rs.Next(ctx, req); err != nil {
		return 0, err
	}
	if req.NumRows() == 0 {
		return 0, nil
	}
	row := iter.Begin()
	// sum() over zero matching rows yields a single NULL row, so the value
	// must only be decoded when it is not NULL.
	if row.IsNull(0) {
		return 0, nil
	}
	return row.GetMyDecimal(0).ToInt()
}

func (h *Handle) initStatsBuckets(cache *statsCache) error {
ctx := kv.WithInternalSourceType(context.Background(), kv.InternalTxnStats)
sql := "select HIGH_PRIORITY table_id, is_index, hist_id, count, repeats, lower_bound, upper_bound, ndv from mysql.stats_buckets order by table_id, is_index, hist_id, bucket_id"
Expand Down Expand Up @@ -438,7 +401,7 @@ func (h *Handle) InitStats(is infoschema.InfoSchema) (err error) {
// Set columns' stats status.
for _, table := range cache.Values() {
for _, col := range table.Columns {
if col.StatsVer != statistics.Version0 || col.Count > 0 {
if col.StatsAvailable() {
if mysql.HasPriKeyFlag(col.Info.GetFlag()) {
col.StatsLoadedStatus = statistics.NewStatsFullLoadStatus()
} else {
Expand Down
1 change: 0 additions & 1 deletion statistics/handle/dump.go
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,6 @@ func TableStatsFromJSON(tableInfo *model.TableInfo, physicalID int64, jsonTbl *J
StatsVer: statsVer,
StatsLoadedStatus: statistics.NewStatsFullLoadStatus(),
}
col.Count = int64(col.TotalRowCount())
tbl.Columns[col.ID] = col
}
}
Expand Down
1 change: 0 additions & 1 deletion statistics/handle/dump_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ func requireTableEqual(t *testing.T, a *statistics.Table, b *statistics.Table) {
require.Equal(t, b.ModifyCount, a.ModifyCount)
require.Equal(t, len(b.Columns), len(a.Columns))
for i := range a.Columns {
require.Equal(t, b.Columns[i].Count, a.Columns[i].Count)
require.True(t, statistics.HistogramEqual(&a.Columns[i].Histogram, &b.Columns[i].Histogram, false))
if a.Columns[i].CMSketch == nil {
require.Nil(t, b.Columns[i].CMSketch)
Expand Down
54 changes: 3 additions & 51 deletions statistics/handle/handle.go
Original file line number Diff line number Diff line change
Expand Up @@ -1131,11 +1131,7 @@ func (h *Handle) loadNeededColumnHistograms(reader *statsReader, col model.Table
IsHandle: c.IsHandle,
StatsVer: statsVer,
}
// Column.Count is calculated by Column.TotalRowCount(). Hence we don't set Column.Count when initializing colHist.
colHist.Count = int64(colHist.TotalRowCount())
// When adding/modifying a column, we create its stats (all values are default values) without setting stats_ver.
// So we also need to check colHist.Count > 0 here.
if statsVer != statistics.Version0 || colHist.Count > 0 {
if colHist.StatsAvailable() {
colHist.StatsLoadedStatus = statistics.NewStatsFullLoadStatus()
}
// Reload the latest stats cache, otherwise the `updateStatsCache` may fail with high probability, because functions
Expand Down Expand Up @@ -1362,23 +1358,16 @@ func (h *Handle) columnStatsFromStorage(reader *statsReader, row chunk.Row, tabl
(col == nil || ((!col.IsStatsInitialized() || col.IsAllEvicted()) && col.LastUpdateVersion < histVer)) &&
!loadAll
if notNeedLoad {
count, err := h.columnCountFromStorage(reader, table.PhysicalID, histID, statsVer)
if err != nil {
return errors.Trace(err)
}
col = &statistics.Column{
PhysicalID: table.PhysicalID,
Histogram: *statistics.NewHistogram(histID, distinct, nullCount, histVer, &colInfo.FieldType, 0, totColSize),
Info: colInfo,
Count: count + nullCount,
ErrorRate: errorRate,
IsHandle: tableInfo.PKIsHandle && mysql.HasPriKeyFlag(colInfo.GetFlag()),
Flag: flag,
StatsVer: statsVer,
}
// When adding/modifying a column, we create its stats (all values are default values) without setting stats_ver.
// So we also need to check col.Count > 0 here.
if statsVer != statistics.Version0 || col.Count > 0 {
if col.StatsAvailable() {
col.StatsLoadedStatus = statistics.NewStatsAllEvictedStatus()
}
lastAnalyzePos.Copy(&col.LastAnalyzePos)
Expand Down Expand Up @@ -1415,11 +1404,7 @@ func (h *Handle) columnStatsFromStorage(reader *statsReader, row chunk.Row, tabl
Flag: flag,
StatsVer: statsVer,
}
// Column.Count is calculated by Column.TotalRowCount(). Hence we don't set Column.Count when initializing col.
col.Count = int64(col.TotalRowCount())
// When adding/modifying a column, we create its stats (all values are default values) without setting stats_ver.
// So we also need to check col.Count > 0 here.
if statsVer != statistics.Version0 || col.Count > 0 {
if col.StatsAvailable() {
col.StatsLoadedStatus = statistics.NewStatsFullLoadStatus()
}
lastAnalyzePos.Copy(&col.LastAnalyzePos)
Expand Down Expand Up @@ -1995,39 +1980,6 @@ func (h *Handle) histogramFromStorage(reader *statsReader, tableID int64, colID
return hg, nil
}

// columnCountFromStorage returns the number of values recorded for a column in
// storage: the sum of `count` in mysql.stats_buckets and, for stats version 2
// or later, additionally the sum of `count` in mysql.stats_top_n.
//
// Before stats ver 2, histogram represents all data in this column.
// In stats ver 2, histogram + TopN represent all data in this column, which is
// why the TopN total is only added for statsVer >= statistics.Version2.
//
// On any read or conversion error it returns 0 and the traced error.
func (h *Handle) columnCountFromStorage(reader *statsReader, tableID, colID, statsVer int64) (int64, error) {
	const (
		bucketSumSQL = "select sum(count) from mysql.stats_buckets where table_id = %? and is_index = 0 and hist_id = %?"
		topNSumSQL   = "select sum(count) from mysql.stats_top_n where table_id = %? and is_index = 0 and hist_id = %?"
	)
	queries := []string{bucketSumSQL}
	if statsVer >= statistics.Version2 {
		queries = append(queries, topNSumSQL)
	}

	var total int64
	for _, query := range queries {
		rows, _, err := reader.read(query, tableID, colID)
		if err != nil {
			return 0, errors.Trace(err)
		}
		// If there are no matching rows, the SQL returns NULL, so the result
		// is only used when it is not NULL.
		if rows[0].IsNull(0) {
			continue
		}
		part, err := rows[0].GetMyDecimal(0).ToInt()
		if err != nil {
			return 0, errors.Trace(err)
		}
		total += part
	}
	return total, nil
}

func (h *Handle) statsMetaByTableIDFromStorage(tableID int64, snapshot uint64) (version uint64, modifyCount, count int64, err error) {
ctx := kv.WithInternalSourceType(context.Background(), kv.InternalTxnStats)
var rows []chunk.Row
Expand Down
6 changes: 1 addition & 5 deletions statistics/handle/handle_hist.go
Original file line number Diff line number Diff line change
Expand Up @@ -395,11 +395,7 @@ func (h *Handle) readStatsForOneItem(item model.TableItemID, w *statsWrapper, re
IsHandle: c.IsHandle,
StatsVer: statsVer,
}
// Column.Count is calculated by Column.TotalRowCount(). Hence, we don't set Column.Count when initializing colHist.
colHist.Count = int64(colHist.TotalRowCount())
// When adding/modifying a column, we create its stats (all values are default values) without setting stats_ver.
// So we also need to check colHist.Count > 0 here.
if statsVer != statistics.Version0 || colHist.Count > 0 {
if colHist.StatsAvailable() {
colHist.StatsLoadedStatus = statistics.NewStatsFullLoadStatus()
}
w.col = colHist
Expand Down
27 changes: 0 additions & 27 deletions statistics/handle/handle_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,6 @@ func assertTableEqual(t *testing.T, a *statistics.Table, b *statistics.Table) {
require.Equal(t, b.ModifyCount, a.ModifyCount)
require.Len(t, a.Columns, len(b.Columns))
for i := range a.Columns {
require.Equal(t, b.Columns[i].Count, a.Columns[i].Count)
require.True(t, statistics.HistogramEqual(&a.Columns[i].Histogram, &b.Columns[i].Histogram, false))
if a.Columns[i].CMSketch == nil {
require.Nil(t, b.Columns[i].CMSketch)
Expand Down Expand Up @@ -463,7 +462,6 @@ func TestLoadHist(t *testing.T) {
hist.TotColSize = temp

require.True(t, hist.CMSketch.Equal(newStatsTbl.Columns[id].CMSketch))
require.Equal(t, newStatsTbl.Columns[id].Count, hist.Count)
require.Equal(t, newStatsTbl.Columns[id].Info, hist.Info)
}
// Add column c3, we only update c3.
Expand Down Expand Up @@ -3198,31 +3196,6 @@ func TestIssues27147(t *testing.T) {
require.Equal(t, nil, err)
}

// TestColumnCountFromStorage analyzes a two-row, single-column table with
// analyze version 2, reloads the table stats from storage, and expects the
// column's Count to be 2.
func TestColumnCountFromStorage(t *testing.T) {
	store, dom := testkit.CreateMockStoreAndDomain(t)
	tk := testkit.NewTestKit(t, store)
	statsHandle := dom.StatsHandle()
	prevLease := statsHandle.Lease()
	defer statsHandle.SetLease(prevLease)
	// `Update` will not use load by need strategy when `Lease` is 0, and `InitStats` is only called when
	// `Lease` is not 0, so here we just change it.
	statsHandle.SetLease(time.Millisecond)

	setupStmts := []string{
		"use test",
		"set tidb_analyze_version = 2",
		"create table tt (c int)",
		"insert into tt values(1), (2)",
		"analyze table tt",
	}
	for _, stmt := range setupStmts {
		tk.MustExec(stmt)
	}

	infoSchema := dom.InfoSchema()
	statsHandle = dom.StatsHandle()
	table, err := infoSchema.TableByName(model.NewCIStr("test"), model.NewCIStr("tt"))
	require.NoError(t, err)
	meta := table.Meta()
	statsHandle.TableStatsFromStorage(meta, meta.ID, false, 0)

	colID := meta.Columns[0].ID
	colStats := statsHandle.GetTableStats(meta).Columns[colID]
	require.Equal(t, int64(2), colStats.Count)
}

func testIncrementalModifyCountUpdateHelper(analyzeSnapshot bool) func(*testing.T) {
return func(t *testing.T) {
store, dom := testkit.CreateMockStoreAndDomain(t)
Expand Down
4 changes: 2 additions & 2 deletions statistics/handle/update.go
Original file line number Diff line number Diff line change
Expand Up @@ -978,12 +978,12 @@ var AutoAnalyzeMinCnt int64 = 1000
// TableAnalyzed checks if the table is analyzed.
func TableAnalyzed(tbl *statistics.Table) bool {
for _, col := range tbl.Columns {
if col.Count > 0 {
if col.IsAnalyzed() {
return true
}
}
for _, idx := range tbl.Indices {
if idx.Histogram.Len() > 0 {
if idx.IsAnalyzed() {
return true
}
}
Expand Down
2 changes: 1 addition & 1 deletion statistics/handle/update_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1370,7 +1370,7 @@ func TestFeedbackWithStatsVer2(t *testing.T) {

func TestNeedAnalyzeTable(t *testing.T) {
columns := map[int64]*statistics.Column{}
columns[1] = &statistics.Column{Count: 1}
columns[1] = &statistics.Column{StatsVer: statistics.Version2}
tests := []struct {
tbl *statistics.Table
ratio float64
Expand Down
4 changes: 3 additions & 1 deletion statistics/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -1598,7 +1598,9 @@ func (s StatsLoadedStatus) IsLoadNeeded() bool {
if s.statsInitialized {
return s.evictedStatus > allLoaded
}
return true
// If statsInitialized is false, it means there is no stats for the column/index in the storage.
// Hence, we don't need to trigger the task of loading the column/index stats.
return false
}

// IsEssentialStatsLoaded indicates whether the essential statistics is loaded.
Expand Down
5 changes: 5 additions & 0 deletions statistics/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -489,3 +489,8 @@ func matchPrefix(row chunk.Row, colIdx int, ad *types.Datum) bool {
}
return false
}

// IsAnalyzed reports whether the index is analyzed, which is decided solely by
// its StatsVer being different from Version0.
func (idx *Index) IsAnalyzed() bool {
return idx.StatsVer != Version0
}
Loading

0 comments on commit 546e491

Please sign in to comment.