statistics: refine index/column stats methods and remove statistics.Column.Count (#44406)

ref #42160, close #44404
pingcap · Jun 5, 2023 · 546e491 · 546e491
1 parent b768eb5
commit 546e491
Show file tree

Hide file tree

Showing 15 changed files with 70 additions and 143 deletions.
diff --git a/planner/core/integration_test.go b/planner/core/integration_test.go
@@ -6871,8 +6871,8 @@ func TestIssue32632(t *testing.T) {
 		"`S_ACCTBAL` decimal(15,2) NOT NULL," +
 		"`S_COMMENT` varchar(101) NOT NULL," +
 		"PRIMARY KEY (`S_SUPPKEY`) /*T![clustered_index] CLUSTERED */)")
-	tk.MustExec("analyze table partsupp;")
-	tk.MustExec("analyze table supplier;")
+	h := dom.StatsHandle()
+	require.NoError(t, h.HandleDDLEvent(<-h.DDLEventCh()))
 	tk.MustExec("set @@tidb_enforce_mpp = 1")
 
 	tbl1, err := dom.InfoSchema().TableByName(model.CIStr{O: "test", L: "test"}, model.CIStr{O: "partsupp", L: "partsupp"})
@@ -6883,7 +6883,6 @@ func TestIssue32632(t *testing.T) {
 	tbl1.Meta().TiFlashReplica = &model.TiFlashReplicaInfo{Count: 1, Available: true}
 	tbl2.Meta().TiFlashReplica = &model.TiFlashReplicaInfo{Count: 1, Available: true}
 
-	h := dom.StatsHandle()
 	statsTbl1 := h.GetTableStats(tbl1.Meta())
 	statsTbl1.Count = 800000
 	statsTbl2 := h.GetTableStats(tbl2.Meta())

diff --git a/planner/core/stats.go b/planner/core/stats.go
@@ -162,13 +162,37 @@ func (p *baseLogicalPlan) DeriveStats(childStats []*property.StatsInfo, selfSche
 	return profile, nil
 }
 
+// getTotalRowCount returns the total row count obtained when collecting colHist, or 0 when neither colHist nor any index/column stats with the same LastUpdateVersion are fully loaded.
+func getTotalRowCount(statsTbl *statistics.Table, colHist *statistics.Column) int64 {
+	if colHist.IsFullLoad() {
+		return int64(colHist.TotalRowCount())
+	}
+	// colHist is not fully loaded: fall back to any fully loaded index/column stats whose LastUpdateVersion equals colHist's.
+	for _, idx := range statsTbl.Indices {
+		if idx.IsFullLoad() && idx.LastUpdateVersion == colHist.LastUpdateVersion {
+			return int64(idx.TotalRowCount())
+		}
+	}
+	for _, col := range statsTbl.Columns {
+		if col.IsFullLoad() && col.LastUpdateVersion == colHist.LastUpdateVersion {
+			return int64(col.TotalRowCount())
+		}
+	}
+	return 0
+}
+
 // getColumnNDV computes estimated NDV of specified column using the original
 // histogram of `DataSource` which is retrieved from storage(not the derived one).
 func (ds *DataSource) getColumnNDV(colID int64) (ndv float64) {
 	hist, ok := ds.statisticTable.Columns[colID]
-	if ok && hist.Count > 0 {
-		factor := float64(ds.statisticTable.Count) / float64(hist.Count)
-		ndv = float64(hist.Histogram.NDV) * factor
+	if ok && hist.IsStatsInitialized() {
+		ndv = float64(hist.Histogram.NDV)
+		// TODO: a better way to get the total row count derived from the last analyze.
+		analyzeCount := getTotalRowCount(ds.statisticTable, hist)
+		if analyzeCount > 0 {
+			factor := float64(ds.statisticTable.Count) / float64(analyzeCount)
+			ndv *= factor
+		}
 	} else {
 		ndv = float64(ds.statisticTable.Count) * distinctFactor
 	}

diff --git a/planner/core/testdata/integration_suite_out.json b/planner/core/testdata/integration_suite_out.json
@@ -9592,8 +9592,8 @@
           "        └─HashJoin 12500.00 mpp[tiflash]  inner join, equal:[eq(test.supplier.s_suppkey, test.partsupp.ps_suppkey)]",
           "          ├─ExchangeReceiver(Build) 10000.00 mpp[tiflash]  ",
           "          │ └─ExchangeSender 10000.00 mpp[tiflash]  ExchangeType: Broadcast",
-          "          │   └─TableFullScan 10000.00 mpp[tiflash] table:supplier keep order:false",
-          "          └─TableFullScan(Probe) 800000.00 mpp[tiflash] table:partsupp keep order:false"
+          "          │   └─TableFullScan 10000.00 mpp[tiflash] table:supplier keep order:false, stats:pseudo",
+          "          └─TableFullScan(Probe) 800000.00 mpp[tiflash] table:partsupp keep order:false, stats:pseudo"
         ]
       }
     ]

diff --git a/statistics/column.go b/statistics/column.go
@@ -39,7 +39,6 @@ type Column struct {
 	TopN       *TopN
 	FMSketch   *FMSketch
 	PhysicalID int64
-	Count      int64
 	Info       *model.ColumnInfo
 	IsHandle   bool
 	ErrorRate
@@ -462,3 +461,23 @@ func (s StatsLoadedStatus) StatusToString() string {
 	}
 	return "unknown"
 }
+
+// IsAnalyzed indicates whether the column is analyzed, i.e. whether its StatsVer is not Version0.
+// The set of IsAnalyzed columns is a subset of the set of StatsAvailable columns.
+func (c *Column) IsAnalyzed() bool {
+	return c.StatsVer != Version0
+}
+
+// StatsAvailable indicates whether the column stats are collected.
+// Note:
+//  1. The function only reports whether the stats are collected, regardless of the stats loaded status.
+//  2. The function is used to decide StatsLoadedStatus.statsInitialized when reading the column stats from storage.
+//  3. There are two cases in which StatsAvailable is true:
+//     a. IsAnalyzed is true.
+//     b. The column is newly added/modified and its stats are generated according to the default value.
+func (c *Column) StatsAvailable() bool {
+	// Typically, when the column is analyzed, StatsVer is set to Version1/Version2, so we check IsAnalyzed().
+	// However, when we add/modify a column, its stats are generated according to the default value without setting
+	// StatsVer, so we also check NDV > 0 || NullCount > 0 to cover that case.
+	return c.IsAnalyzed() || c.NDV > 0 || c.NullCount > 0
+}
diff --git a/statistics/handle/bootstrap.go b/statistics/handle/bootstrap.go
@@ -138,23 +138,12 @@ func (h *Handle) initStatsHistograms4Chunk(is infoschema.InfoSchema, cache *stat
 			if colInfo == nil {
 				continue
 			}
-			var topnCount int64
-			// If this is stats of the Version2, we need to consider the topn's count as well.
-			// See the comments of Version2 for more details.
-			if statsVer >= statistics.Version2 {
-				var err error
-				topnCount, err = h.initTopNCountSum(tblID, id)
-				if err != nil {
-					terror.Log(err)
-				}
-			}
 			hist := statistics.NewHistogram(id, ndv, nullCount, version, &colInfo.FieldType, 0, totColSize)
 			hist.Correlation = row.GetFloat64(9)
 			col := &statistics.Column{
 				Histogram:  *hist,
 				PhysicalID: table.PhysicalID,
 				Info:       colInfo,
-				Count:      nullCount + topnCount,
 				IsHandle:   tbl.Meta().PKIsHandle && mysql.HasPriKeyFlag(colInfo.GetFlag()),
 				Flag:       row.GetInt64(10),
 				StatsVer:   statsVer,
@@ -306,7 +295,6 @@ func (h *Handle) initStatsBuckets4Chunk(cache *statsCache, iter *chunk.Iterator4
 			if !ok {
 				continue
 			}
-			column.Count += row.GetInt64(3)
 			if !mysql.HasPriKeyFlag(column.Info.GetFlag()) {
 				continue
 			}
@@ -334,31 +322,6 @@ func (h *Handle) initStatsBuckets4Chunk(cache *statsCache, iter *chunk.Iterator4
 	}
 }
 
-func (h *Handle) initTopNCountSum(tableID, colID int64) (int64, error) {
-	// Before stats ver 2, histogram represents all data in this column.
-	// In stats ver 2, histogram + TopN represent all data in this column.
-	// So we need to add TopN total count here.
-	ctx := kv.WithInternalSourceType(context.Background(), kv.InternalTxnStats)
-	selSQL := "select sum(count) from mysql.stats_top_n where table_id = %? and is_index = 0 and hist_id = %?"
-	rs, err := h.initStatsCtx.(sqlexec.SQLExecutor).ExecuteInternal(ctx, selSQL, tableID, colID)
-	if rs != nil {
-		defer terror.Call(rs.Close)
-	}
-	if err != nil {
-		return 0, err
-	}
-	req := rs.NewChunk(nil)
-	iter := chunk.NewIterator4Chunk(req)
-	err = rs.Next(ctx, req)
-	if err != nil {
-		return 0, err
-	}
-	if req.NumRows() == 0 {
-		return 0, nil
-	}
-	return iter.Begin().GetMyDecimal(0).ToInt()
-}
-
 func (h *Handle) initStatsBuckets(cache *statsCache) error {
 	ctx := kv.WithInternalSourceType(context.Background(), kv.InternalTxnStats)
 	sql := "select HIGH_PRIORITY table_id, is_index, hist_id, count, repeats, lower_bound, upper_bound, ndv from mysql.stats_buckets order by table_id, is_index, hist_id, bucket_id"
@@ -438,7 +401,7 @@ func (h *Handle) InitStats(is infoschema.InfoSchema) (err error) {
 	// Set columns' stats status.
 	for _, table := range cache.Values() {
 		for _, col := range table.Columns {
-			if col.StatsVer != statistics.Version0 || col.Count > 0 {
+			if col.StatsAvailable() {
 				if mysql.HasPriKeyFlag(col.Info.GetFlag()) {
 					col.StatsLoadedStatus = statistics.NewStatsFullLoadStatus()
 				} else {

diff --git a/statistics/handle/dump.go b/statistics/handle/dump.go
@@ -357,7 +357,6 @@ func TableStatsFromJSON(tableInfo *model.TableInfo, physicalID int64, jsonTbl *J
 				StatsVer:          statsVer,
 				StatsLoadedStatus: statistics.NewStatsFullLoadStatus(),
 			}
-			col.Count = int64(col.TotalRowCount())
 			tbl.Columns[col.ID] = col
 		}
 	}

diff --git a/statistics/handle/dump_test.go b/statistics/handle/dump_test.go
@@ -33,7 +33,6 @@ func requireTableEqual(t *testing.T, a *statistics.Table, b *statistics.Table) {
 	require.Equal(t, b.ModifyCount, a.ModifyCount)
 	require.Equal(t, len(b.Columns), len(a.Columns))
 	for i := range a.Columns {
-		require.Equal(t, b.Columns[i].Count, a.Columns[i].Count)
 		require.True(t, statistics.HistogramEqual(&a.Columns[i].Histogram, &b.Columns[i].Histogram, false))
 		if a.Columns[i].CMSketch == nil {
 			require.Nil(t, b.Columns[i].CMSketch)

diff --git a/statistics/handle/handle.go b/statistics/handle/handle.go
@@ -1131,11 +1131,7 @@ func (h *Handle) loadNeededColumnHistograms(reader *statsReader, col model.Table
 		IsHandle:   c.IsHandle,
 		StatsVer:   statsVer,
 	}
-	// Column.Count is calculated by Column.TotalRowCount(). Hence we don't set Column.Count when initializing colHist.
-	colHist.Count = int64(colHist.TotalRowCount())
-	// When adding/modifying a column, we create its stats(all values are default values) without setting stats_ver.
-	// So we need add colHist.Count > 0 here.
-	if statsVer != statistics.Version0 || colHist.Count > 0 {
+	if colHist.StatsAvailable() {
 		colHist.StatsLoadedStatus = statistics.NewStatsFullLoadStatus()
 	}
 	// Reload the latest stats cache, otherwise the `updateStatsCache` may fail with high probability, because functions
@@ -1362,23 +1358,16 @@ func (h *Handle) columnStatsFromStorage(reader *statsReader, row chunk.Row, tabl
 			(col == nil || ((!col.IsStatsInitialized() || col.IsAllEvicted()) && col.LastUpdateVersion < histVer)) &&
 			!loadAll
 		if notNeedLoad {
-			count, err := h.columnCountFromStorage(reader, table.PhysicalID, histID, statsVer)
-			if err != nil {
-				return errors.Trace(err)
-			}
 			col = &statistics.Column{
 				PhysicalID: table.PhysicalID,
 				Histogram:  *statistics.NewHistogram(histID, distinct, nullCount, histVer, &colInfo.FieldType, 0, totColSize),
 				Info:       colInfo,
-				Count:      count + nullCount,
 				ErrorRate:  errorRate,
 				IsHandle:   tableInfo.PKIsHandle && mysql.HasPriKeyFlag(colInfo.GetFlag()),
 				Flag:       flag,
 				StatsVer:   statsVer,
 			}
-			// When adding/modifying a column, we create its stats(all values are default values) without setting stats_ver.
-			// So we need add col.Count > 0 here.
-			if statsVer != statistics.Version0 || col.Count > 0 {
+			if col.StatsAvailable() {
 				col.StatsLoadedStatus = statistics.NewStatsAllEvictedStatus()
 			}
 			lastAnalyzePos.Copy(&col.LastAnalyzePos)
@@ -1415,11 +1404,7 @@ func (h *Handle) columnStatsFromStorage(reader *statsReader, row chunk.Row, tabl
 				Flag:       flag,
 				StatsVer:   statsVer,
 			}
-			// Column.Count is calculated by Column.TotalRowCount(). Hence we don't set Column.Count when initializing col.
-			col.Count = int64(col.TotalRowCount())
-			// When adding/modifying a column, we create its stats(all values are default values) without setting stats_ver.
-			// So we need add colHist.Count > 0 here.
-			if statsVer != statistics.Version0 || col.Count > 0 {
+			if col.StatsAvailable() {
 				col.StatsLoadedStatus = statistics.NewStatsFullLoadStatus()
 			}
 			lastAnalyzePos.Copy(&col.LastAnalyzePos)
@@ -1995,39 +1980,6 @@ func (h *Handle) histogramFromStorage(reader *statsReader, tableID int64, colID
 	return hg, nil
 }
 
-func (h *Handle) columnCountFromStorage(reader *statsReader, tableID, colID, statsVer int64) (int64, error) {
-	count := int64(0)
-	rows, _, err := reader.read("select sum(count) from mysql.stats_buckets where table_id = %? and is_index = 0 and hist_id = %?", tableID, colID)
-	if err != nil {
-		return 0, errors.Trace(err)
-	}
-	// If there doesn't exist any buckets, the SQL will return NULL. So we only use the result if it's not NULL.
-	if !rows[0].IsNull(0) {
-		count, err = rows[0].GetMyDecimal(0).ToInt()
-		if err != nil {
-			return 0, errors.Trace(err)
-		}
-	}
-
-	if statsVer >= statistics.Version2 {
-		// Before stats ver 2, histogram represents all data in this column.
-		// In stats ver 2, histogram + TopN represent all data in this column.
-		// So we need to add TopN total count here.
-		rows, _, err = reader.read("select sum(count) from mysql.stats_top_n where table_id = %? and is_index = 0 and hist_id = %?", tableID, colID)
-		if err != nil {
-			return 0, errors.Trace(err)
-		}
-		if !rows[0].IsNull(0) {
-			topNCount, err := rows[0].GetMyDecimal(0).ToInt()
-			if err != nil {
-				return 0, errors.Trace(err)
-			}
-			count += topNCount
-		}
-	}
-	return count, err
-}
-
 func (h *Handle) statsMetaByTableIDFromStorage(tableID int64, snapshot uint64) (version uint64, modifyCount, count int64, err error) {
 	ctx := kv.WithInternalSourceType(context.Background(), kv.InternalTxnStats)
 	var rows []chunk.Row

diff --git a/statistics/handle/handle_hist.go b/statistics/handle/handle_hist.go
@@ -395,11 +395,7 @@ func (h *Handle) readStatsForOneItem(item model.TableItemID, w *statsWrapper, re
 			IsHandle:   c.IsHandle,
 			StatsVer:   statsVer,
 		}
-		// Column.Count is calculated by Column.TotalRowCount(). Hence, we don't set Column.Count when initializing colHist.
-		colHist.Count = int64(colHist.TotalRowCount())
-		// When adding/modifying a column, we create its stats(all values are default values) without setting stats_ver.
-		// So we need add colHist.Count > 0 here.
-		if statsVer != statistics.Version0 || colHist.Count > 0 {
+		if colHist.StatsAvailable() {
 			colHist.StatsLoadedStatus = statistics.NewStatsFullLoadStatus()
 		}
 		w.col = colHist

diff --git a/statistics/handle/handle_test.go b/statistics/handle/handle_test.go
@@ -149,7 +149,6 @@ func assertTableEqual(t *testing.T, a *statistics.Table, b *statistics.Table) {
 	require.Equal(t, b.ModifyCount, a.ModifyCount)
 	require.Len(t, a.Columns, len(b.Columns))
 	for i := range a.Columns {
-		require.Equal(t, b.Columns[i].Count, a.Columns[i].Count)
 		require.True(t, statistics.HistogramEqual(&a.Columns[i].Histogram, &b.Columns[i].Histogram, false))
 		if a.Columns[i].CMSketch == nil {
 			require.Nil(t, b.Columns[i].CMSketch)
@@ -463,7 +462,6 @@ func TestLoadHist(t *testing.T) {
 		hist.TotColSize = temp
 
 		require.True(t, hist.CMSketch.Equal(newStatsTbl.Columns[id].CMSketch))
-		require.Equal(t, newStatsTbl.Columns[id].Count, hist.Count)
 		require.Equal(t, newStatsTbl.Columns[id].Info, hist.Info)
 	}
 	// Add column c3, we only update c3.
@@ -3198,31 +3196,6 @@ func TestIssues27147(t *testing.T) {
 	require.Equal(t, nil, err)
 }
 
-func TestColumnCountFromStorage(t *testing.T) {
-	store, dom := testkit.CreateMockStoreAndDomain(t)
-	testKit := testkit.NewTestKit(t, store)
-	do := dom
-	h := do.StatsHandle()
-	originLease := h.Lease()
-	defer h.SetLease(originLease)
-	// `Update` will not use load by need strategy when `Lease` is 0, and `InitStats` is only called when
-	// `Lease` is not 0, so here we just change it.
-	h.SetLease(time.Millisecond)
-	testKit.MustExec("use test")
-	testKit.MustExec("set tidb_analyze_version = 2")
-	testKit.MustExec("create table tt (c int)")
-	testKit.MustExec("insert into tt values(1), (2)")
-	testKit.MustExec("analyze table tt")
-	is := do.InfoSchema()
-	h = do.StatsHandle()
-	tbl, err := is.TableByName(model.NewCIStr("test"), model.NewCIStr("tt"))
-	require.NoError(t, err)
-	tblInfo := tbl.Meta()
-	h.TableStatsFromStorage(tblInfo, tblInfo.ID, false, 0)
-	statsTbl := h.GetTableStats(tblInfo)
-	require.Equal(t, int64(2), statsTbl.Columns[tblInfo.Columns[0].ID].Count)
-}
-
 func testIncrementalModifyCountUpdateHelper(analyzeSnapshot bool) func(*testing.T) {
 	return func(t *testing.T) {
 		store, dom := testkit.CreateMockStoreAndDomain(t)

diff --git a/statistics/handle/update.go b/statistics/handle/update.go
@@ -978,12 +978,12 @@ var AutoAnalyzeMinCnt int64 = 1000
 // TableAnalyzed checks if the table is analyzed.
 func TableAnalyzed(tbl *statistics.Table) bool {
 	for _, col := range tbl.Columns {
-		if col.Count > 0 {
+		if col.IsAnalyzed() {
 			return true
 		}
 	}
 	for _, idx := range tbl.Indices {
-		if idx.Histogram.Len() > 0 {
+		if idx.IsAnalyzed() {
 			return true
 		}
 	}

diff --git a/statistics/handle/update_test.go b/statistics/handle/update_test.go
@@ -1370,7 +1370,7 @@ func TestFeedbackWithStatsVer2(t *testing.T) {
 
 func TestNeedAnalyzeTable(t *testing.T) {
 	columns := map[int64]*statistics.Column{}
-	columns[1] = &statistics.Column{Count: 1}
+	columns[1] = &statistics.Column{StatsVer: statistics.Version2}
 	tests := []struct {
 		tbl    *statistics.Table
 		ratio  float64

diff --git a/statistics/histogram.go b/statistics/histogram.go
@@ -1598,7 +1598,9 @@ func (s StatsLoadedStatus) IsLoadNeeded() bool {
 	if s.statsInitialized {
 		return s.evictedStatus > allLoaded
 	}
-	return true
+	// If statsInitialized is false, it means there is no stats for the column/index in the storage.
+	// Hence, we don't need to trigger the task of loading the column/index stats.
+	return false
 }
 
 // IsEssentialStatsLoaded indicates whether the essential statistics is loaded.

diff --git a/statistics/index.go b/statistics/index.go
@@ -489,3 +489,8 @@ func matchPrefix(row chunk.Row, colIdx int, ad *types.Datum) bool {
 	}
 	return false
 }
+
+// IsAnalyzed indicates whether the index is analyzed, i.e. whether its StatsVer is not Version0.
+func (idx *Index) IsAnalyzed() bool {
+	return idx.StatsVer != Version0
+}