Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

statistics: remove statistics.Column.Count (#43033) #44405

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3,475 changes: 3,475 additions & 0 deletions planner/core/casetest/integration_test.go

Large diffs are not rendered by default.

30 changes: 30 additions & 0 deletions planner/core/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,13 +162,43 @@ func (p *baseLogicalPlan) DeriveStats(childStats []*property.StatsInfo, selfSche
return profile, nil
}

// getTotalRowCount reports the total row count associated with colHist.
//
// When colHist is fully loaded, its own TotalRowCount is returned. Otherwise the
// count is borrowed from any fully loaded index (checked first) or column
// statistic in statsTbl whose LastUpdateVersion equals colHist's. It returns 0
// when no such statistic is found.
func getTotalRowCount(statsTbl *statistics.Table, colHist *statistics.Column) int64 {
	if colHist.IsFullLoad() {
		return int64(colHist.TotalRowCount())
	}

	// colHist is not fully loaded: fall back to a sibling statistic that carries
	// the same update version.
	for _, index := range statsTbl.Indices {
		if !index.IsFullLoad() || index.LastUpdateVersion != colHist.LastUpdateVersion {
			continue
		}
		return int64(index.TotalRowCount())
	}
	for _, column := range statsTbl.Columns {
		if !column.IsFullLoad() || column.LastUpdateVersion != colHist.LastUpdateVersion {
			continue
		}
		return int64(column.TotalRowCount())
	}

	// Nothing usable was found.
	return 0
}

// getColumnNDV computes estimated NDV of specified column using the original
// histogram of `DataSource` which is retrieved from storage(not the derived one).
func (ds *DataSource) getColumnNDV(colID int64) (ndv float64) {
hist, ok := ds.statisticTable.Columns[colID]
<<<<<<< HEAD
if ok && hist.Count > 0 {
factor := float64(ds.statisticTable.Count) / float64(hist.Count)
ndv = float64(hist.Histogram.NDV) * factor
=======
if ok && hist.IsStatsInitialized() {
ndv = float64(hist.Histogram.NDV)
// TODO: a better way to get the total row count derived from the last analyze.
analyzeCount := getTotalRowCount(ds.statisticTable, hist)
if analyzeCount > 0 {
factor := float64(ds.statisticTable.RealtimeCount) / float64(analyzeCount)
ndv *= factor
}
>>>>>>> 579f47e4122 (statistics: remove statistics.Column.Count (#43033))
} else {
ndv = float64(ds.statisticTable.Count) * distinctFactor
}
Expand Down
6 changes: 6 additions & 0 deletions planner/core/testdata/integration_suite_out.json
Original file line number Diff line number Diff line change
Expand Up @@ -9591,9 +9591,15 @@
" └─Projection 12500.00 mpp[tiflash] test.partsupp.ps_supplycost",
" └─HashJoin 12500.00 mpp[tiflash] inner join, equal:[eq(test.supplier.s_suppkey, test.partsupp.ps_suppkey)]",
" ├─ExchangeReceiver(Build) 10000.00 mpp[tiflash] ",
<<<<<<< HEAD:planner/core/testdata/integration_suite_out.json
" │ └─ExchangeSender 10000.00 mpp[tiflash] ExchangeType: Broadcast",
" │ └─TableFullScan 10000.00 mpp[tiflash] table:supplier keep order:false",
" └─TableFullScan(Probe) 800000.00 mpp[tiflash] table:partsupp keep order:false"
=======
" │ └─ExchangeSender 10000.00 mpp[tiflash] ExchangeType: Broadcast, Compression: FAST",
" │ └─TableFullScan 10000.00 mpp[tiflash] table:supplier keep order:false, stats:pseudo",
" └─TableFullScan(Probe) 800000.00 mpp[tiflash] table:partsupp keep order:false, stats:pseudo"
>>>>>>> 579f47e4122 (statistics: remove statistics.Column.Count (#43033)):planner/core/casetest/testdata/integration_suite_out.json
]
}
]
Expand Down
1 change: 0 additions & 1 deletion statistics/column.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ type Column struct {
TopN *TopN
FMSketch *FMSketch
PhysicalID int64
Count int64
Info *model.ColumnInfo
IsHandle bool
ErrorRate
Expand Down
37 changes: 0 additions & 37 deletions statistics/handle/bootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,23 +138,12 @@ func (h *Handle) initStatsHistograms4Chunk(is infoschema.InfoSchema, cache *stat
if colInfo == nil {
continue
}
var topnCount int64
// If this is stats of the Version2, we need to consider the topn's count as well.
// See the comments of Version2 for more details.
if statsVer >= statistics.Version2 {
var err error
topnCount, err = h.initTopNCountSum(tblID, id)
if err != nil {
terror.Log(err)
}
}
hist := statistics.NewHistogram(id, ndv, nullCount, version, &colInfo.FieldType, 0, totColSize)
hist.Correlation = row.GetFloat64(9)
col := &statistics.Column{
Histogram: *hist,
PhysicalID: table.PhysicalID,
Info: colInfo,
Count: nullCount + topnCount,
IsHandle: tbl.Meta().PKIsHandle && mysql.HasPriKeyFlag(colInfo.GetFlag()),
Flag: row.GetInt64(10),
StatsVer: statsVer,
Expand Down Expand Up @@ -306,7 +295,6 @@ func (h *Handle) initStatsBuckets4Chunk(cache *statsCache, iter *chunk.Iterator4
if !ok {
continue
}
column.Count += row.GetInt64(3)
if !mysql.HasPriKeyFlag(column.Info.GetFlag()) {
continue
}
Expand Down Expand Up @@ -334,31 +322,6 @@ func (h *Handle) initStatsBuckets4Chunk(cache *statsCache, iter *chunk.Iterator4
}
}

// initTopNCountSum returns sum(count) over the mysql.stats_top_n rows that
// belong to column colID (is_index = 0) of table tableID.
//
// Before stats ver 2, the histogram represents all data in the column. In stats
// ver 2, histogram + TopN together represent all data, so the TopN total count
// has to be added by the caller.
//
// It returns 0 with a nil error when the query produces no row, and propagates
// any error from executing the query or fetching the first chunk.
func (h *Handle) initTopNCountSum(tableID, colID int64) (int64, error) {
	const topNSumSQL = "select sum(count) from mysql.stats_top_n where table_id = %? and is_index = 0 and hist_id = %?"

	ctx := kv.WithInternalSourceType(context.Background(), kv.InternalTxnStats)
	executor := h.initStatsCtx.(sqlexec.SQLExecutor)
	recordSet, err := executor.ExecuteInternal(ctx, topNSumSQL, tableID, colID)
	// Register the close before the error check so that a non-nil record set
	// returned together with an error is still released.
	if recordSet != nil {
		defer terror.Call(recordSet.Close)
	}
	if err != nil {
		return 0, err
	}

	chk := recordSet.NewChunk(nil)
	rows := chunk.NewIterator4Chunk(chk)
	if err = recordSet.Next(ctx, chk); err != nil {
		return 0, err
	}
	if chk.NumRows() == 0 {
		return 0, nil
	}
	return rows.Begin().GetMyDecimal(0).ToInt()
}

func (h *Handle) initStatsBuckets(cache *statsCache) error {
ctx := kv.WithInternalSourceType(context.Background(), kv.InternalTxnStats)
sql := "select HIGH_PRIORITY table_id, is_index, hist_id, count, repeats, lower_bound, upper_bound, ndv from mysql.stats_buckets order by table_id, is_index, hist_id, bucket_id"
Expand Down
1 change: 0 additions & 1 deletion statistics/handle/dump.go
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,6 @@ func TableStatsFromJSON(tableInfo *model.TableInfo, physicalID int64, jsonTbl *J
StatsVer: statsVer,
StatsLoadedStatus: statistics.NewStatsFullLoadStatus(),
}
col.Count = int64(col.TotalRowCount())
tbl.Columns[col.ID] = col
}
}
Expand Down
1 change: 0 additions & 1 deletion statistics/handle/dump_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ func requireTableEqual(t *testing.T, a *statistics.Table, b *statistics.Table) {
require.Equal(t, b.ModifyCount, a.ModifyCount)
require.Equal(t, len(b.Columns), len(a.Columns))
for i := range a.Columns {
require.Equal(t, b.Columns[i].Count, a.Columns[i].Count)
require.True(t, statistics.HistogramEqual(&a.Columns[i].Histogram, &b.Columns[i].Histogram, false))
if a.Columns[i].CMSketch == nil {
require.Nil(t, b.Columns[i].CMSketch)
Expand Down
4 changes: 4 additions & 0 deletions statistics/handle/handle.go
Original file line number Diff line number Diff line change
Expand Up @@ -1131,11 +1131,15 @@ func (h *Handle) loadNeededColumnHistograms(reader *statsReader, col model.Table
IsHandle: c.IsHandle,
StatsVer: statsVer,
}
<<<<<<< HEAD
// Column.Count is calculated by Column.TotalRowCount(). Hence we don't set Column.Count when initializing colHist.
colHist.Count = int64(colHist.TotalRowCount())
// When adding/modifying a column, we create its stats(all values are default values) without setting stats_ver.
// So we need add colHist.Count > 0 here.
if statsVer != statistics.Version0 || colHist.Count > 0 {
=======
if colHist.StatsAvailable() {
>>>>>>> 579f47e4122 (statistics: remove statistics.Column.Count (#43033))
colHist.StatsLoadedStatus = statistics.NewStatsFullLoadStatus()
}
// Reload the latest stats cache, otherwise the `updateStatsCache` may fail with high probability, because functions
Expand Down
4 changes: 4 additions & 0 deletions statistics/handle/handle_hist.go
Original file line number Diff line number Diff line change
Expand Up @@ -395,11 +395,15 @@ func (h *Handle) readStatsForOneItem(item model.TableItemID, w *statsWrapper, re
IsHandle: c.IsHandle,
StatsVer: statsVer,
}
<<<<<<< HEAD
// Column.Count is calculated by Column.TotalRowCount(). Hence, we don't set Column.Count when initializing colHist.
colHist.Count = int64(colHist.TotalRowCount())
// When adding/modifying a column, we create its stats(all values are default values) without setting stats_ver.
// So we need add colHist.Count > 0 here.
if statsVer != statistics.Version0 || colHist.Count > 0 {
=======
if colHist.StatsAvailable() {
>>>>>>> 579f47e4122 (statistics: remove statistics.Column.Count (#43033))
colHist.StatsLoadedStatus = statistics.NewStatsFullLoadStatus()
}
w.col = colHist
Expand Down
26 changes: 0 additions & 26 deletions statistics/handle/handle_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -463,7 +463,6 @@ func TestLoadHist(t *testing.T) {
hist.TotColSize = temp

require.True(t, hist.CMSketch.Equal(newStatsTbl.Columns[id].CMSketch))
require.Equal(t, newStatsTbl.Columns[id].Count, hist.Count)
require.Equal(t, newStatsTbl.Columns[id].Info, hist.Info)
}
// Add column c3, we only update c3.
Expand Down Expand Up @@ -3198,31 +3197,6 @@ func TestIssues27147(t *testing.T) {
require.Equal(t, nil, err)
}

// TestColumnCountFromStorage analyzes a two-row table with
// tidb_analyze_version = 2, reloads its stats through TableStatsFromStorage and
// checks that the Count of the table's first column is 2.
func TestColumnCountFromStorage(t *testing.T) {
	store, dom := testkit.CreateMockStoreAndDomain(t)
	tk := testkit.NewTestKit(t, store)
	statsHandle := dom.StatsHandle()

	// `Update` will not use load by need strategy when `Lease` is 0, and `InitStats` is only called when
	// `Lease` is not 0, so here we just change it (and restore it when the test ends).
	savedLease := statsHandle.Lease()
	defer statsHandle.SetLease(savedLease)
	statsHandle.SetLease(time.Millisecond)

	setupSQLs := []string{
		"use test",
		"set tidb_analyze_version = 2",
		"create table tt (c int)",
		"insert into tt values(1), (2)",
		"analyze table tt",
	}
	for _, sql := range setupSQLs {
		tk.MustExec(sql)
	}

	infoSchema := dom.InfoSchema()
	statsHandle = dom.StatsHandle()
	table, err := infoSchema.TableByName(model.NewCIStr("test"), model.NewCIStr("tt"))
	require.NoError(t, err)
	meta := table.Meta()
	statsHandle.TableStatsFromStorage(meta, meta.ID, false, 0)
	loaded := statsHandle.GetTableStats(meta)
	firstColID := meta.Columns[0].ID
	require.Equal(t, int64(2), loaded.Columns[firstColID].Count)
}

func testIncrementalModifyCountUpdateHelper(analyzeSnapshot bool) func(*testing.T) {
return func(t *testing.T) {
store, dom := testkit.CreateMockStoreAndDomain(t)
Expand Down
80 changes: 80 additions & 0 deletions statistics/handle/internal/testutil.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// Copyright 2023 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package internal

import (
"testing"

"github.com/pingcap/tidb/statistics"
"github.com/stretchr/testify/require"
)

// AssertTableEqual asserts that two table statistics are equal and fails the
// test immediately otherwise.
//
// It compares RealtimeCount and ModifyCount, then, for every column and every
// index of a, the histogram, CMSketch and TopN against the entry stored under
// the same key in b, and finally the extended stats via IsSameExtendedStats.
func AssertTableEqual(t *testing.T, a *statistics.Table, b *statistics.Table) {
	require.Equal(t, b.RealtimeCount, a.RealtimeCount)
	require.Equal(t, b.ModifyCount, a.ModifyCount)

	require.Len(t, a.Columns, len(b.Columns))
	for id, aCol := range a.Columns {
		bCol := b.Columns[id]
		// Equal sizes do not imply equal key sets: without this guard a column
		// missing from b would surface as a nil-pointer panic below instead of
		// a readable assertion failure.
		require.NotNilf(t, bCol, "column %v is present in a but missing in b", id)
		require.True(t, statistics.HistogramEqual(&aCol.Histogram, &bCol.Histogram, false))
		if aCol.CMSketch == nil {
			require.Nil(t, bCol.CMSketch)
		} else {
			require.True(t, aCol.CMSketch.Equal(bCol.CMSketch))
		}
		// The nil case has been considered in (*TopN).Equal() so we don't need to consider it here.
		require.Truef(t, aCol.TopN.Equal(bCol.TopN), "%v, %v", aCol.TopN, bCol.TopN)
	}

	require.Len(t, a.Indices, len(b.Indices))
	for i := range a.Indices {
		aIdx, bIdx := a.Indices[i], b.Indices[i]
		// Same guard as for columns: report a missing counterpart as a test
		// failure rather than dereferencing nil.
		require.NotNilf(t, bIdx, "index %v is present in a but missing in b", i)
		require.True(t, statistics.HistogramEqual(&aIdx.Histogram, &bIdx.Histogram, false))
		if aIdx.CMSketch == nil {
			require.Nil(t, bIdx.CMSketch)
		} else {
			require.True(t, aIdx.CMSketch.Equal(bIdx.CMSketch))
		}
		require.True(t, aIdx.TopN.Equal(bIdx.TopN))
	}

	require.True(t, IsSameExtendedStats(a.ExtendedStats, b.ExtendedStats))
}

// IsSameExtendedStats reports whether two extended stats collections are the same.
//
// A nil collection and a collection with no items are both treated as empty,
// and two empty collections are equal. Otherwise the collections are equal when
// they hold the same keys and, for each key, the items have identical ColIDs
// (same length, same order), Tp, ScalarVals and StringVals.
func IsSameExtendedStats(a, b *statistics.ExtendedStatsColl) bool {
	aEmpty := a == nil || len(a.Stats) == 0
	bEmpty := b == nil || len(b.Stats) == 0
	if aEmpty || bEmpty {
		// Equal only when both sides are empty.
		return aEmpty && bEmpty
	}
	if len(a.Stats) != len(b.Stats) {
		return false
	}
	for key, aItem := range a.Stats {
		bItem, ok := b.Stats[key]
		if !ok {
			return false
		}
		// Compare lengths first: indexing bItem.ColIDs by positions of
		// aItem.ColIDs would panic when b's list is shorter, and would ignore
		// trailing IDs when b's list is longer.
		if len(aItem.ColIDs) != len(bItem.ColIDs) {
			return false
		}
		for i, id := range aItem.ColIDs {
			if id != bItem.ColIDs[i] {
				return false
			}
		}
		if aItem.Tp != bItem.Tp || aItem.ScalarVals != bItem.ScalarVals || aItem.StringVals != bItem.StringVals {
			return false
		}
	}
	return true
}
Loading