Skip to content

Commit

Permalink
planner: update the scan-row-size calculation formula in model2 (#38968)
Browse files Browse the repository at this point in the history
ref #35240
  • Loading branch information
qw4990 authored Nov 9, 2022
1 parent 10115e0 commit ce62915
Show file tree
Hide file tree
Showing 7 changed files with 217 additions and 172 deletions.
4 changes: 2 additions & 2 deletions planner/core/plan_cost_ver1.go
Original file line number Diff line number Diff line change
Expand Up @@ -852,7 +852,7 @@ func (p *PhysicalHashJoin) GetCost(lCnt, rCnt float64, isMPP bool, costFlag uint
sessVars := p.ctx.GetSessionVars()
oomUseTmpStorage := variable.EnableTmpStorageOnOOM.Load()
memQuota := sessVars.MemTracker.GetBytesLimit() // sessVars.MemQuotaQuery && hint
rowSize := getAvgRowSize(build.statsInfo(), build.Schema())
rowSize := getAvgRowSize(build.statsInfo(), build.Schema().Columns)
spill := oomUseTmpStorage && memQuota > 0 && rowSize*buildCnt > float64(memQuota) && p.storeTp != kv.TiFlash
// Cost of building hash table.
cpuFactor := sessVars.GetCPUFactor()
Expand Down Expand Up @@ -1049,7 +1049,7 @@ func (p *PhysicalSort) GetCost(count float64, schema *expression.Schema) float64

oomUseTmpStorage := variable.EnableTmpStorageOnOOM.Load()
memQuota := sessVars.MemTracker.GetBytesLimit() // sessVars.MemQuotaQuery && hint
rowSize := getAvgRowSize(p.statsInfo(), schema)
rowSize := getAvgRowSize(p.statsInfo(), schema.Columns)
spill := oomUseTmpStorage && memQuota > 0 && rowSize*count > float64(memQuota)
diskCost := count * sessVars.GetDiskFactor() * rowSize
if !spill {
Expand Down
32 changes: 19 additions & 13 deletions planner/core/plan_cost_ver2.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ func (p *PhysicalIndexScan) getPlanCostVer2(taskType property.TaskType, option *
}

rows := getCardinality(p, option.CostFlag)
rowSize := math.Max(p.getScanRowSize(), 2.0)
rowSize := math.Max(getAvgRowSize(p.stats, p.schema.Columns), 2.0) // consider all index columns
scanFactor := getTaskScanFactorVer2(p, taskType)

p.planCostVer2 = scanCostVer2(option, rows, rowSize, scanFactor)
Expand All @@ -132,7 +132,13 @@ func (p *PhysicalTableScan) getPlanCostVer2(taskType property.TaskType, option *
}

rows := getCardinality(p, option.CostFlag)
rowSize := math.Max(p.getScanRowSize(), 2.0)
var rowSize float64
if p.StoreType == kv.TiKV {
rowSize = getAvgRowSize(p.stats, p.tblCols) // consider all columns if TiKV
} else { // TiFlash
rowSize = getAvgRowSize(p.stats, p.schema.Columns)
}
rowSize = math.Max(rowSize, 2.0)
scanFactor := getTaskScanFactorVer2(p, taskType)

p.planCostVer2 = scanCostVer2(option, rows, rowSize, scanFactor)
Expand All @@ -155,7 +161,7 @@ func (p *PhysicalIndexReader) getPlanCostVer2(taskType property.TaskType, option
}

rows := getCardinality(p.indexPlan, option.CostFlag)
rowSize := getAvgRowSize(p.indexPlan.Stats(), p.indexPlan.Schema())
rowSize := getAvgRowSize(p.indexPlan.Stats(), p.indexPlan.Schema().Columns)
netFactor := getTaskNetFactorVer2(p, taskType)
concurrency := float64(p.ctx.GetSessionVars().DistSQLScanConcurrency())

Expand All @@ -180,7 +186,7 @@ func (p *PhysicalTableReader) getPlanCostVer2(taskType property.TaskType, option
}

rows := getCardinality(p.tablePlan, option.CostFlag)
rowSize := getAvgRowSize(p.tablePlan.Stats(), p.tablePlan.Schema())
rowSize := getAvgRowSize(p.tablePlan.Stats(), p.tablePlan.Schema().Columns)
netFactor := getTaskNetFactorVer2(p, taskType)
concurrency := float64(p.ctx.GetSessionVars().DistSQLScanConcurrency())
childType := property.CopSingleReadTaskType
Expand Down Expand Up @@ -284,7 +290,7 @@ func (p *PhysicalIndexMergeReader) getPlanCostVer2(taskType property.TaskType, o
var tableSideCost costVer2
if tablePath := p.tablePlan; tablePath != nil {
rows := getCardinality(tablePath, option.CostFlag)
rowSize := getAvgRowSize(tablePath.Stats(), tablePath.Schema())
rowSize := getAvgRowSize(tablePath.Stats(), tablePath.Schema().Columns)

tableNetCost := netCostVer2(option, rows, rowSize, netFactor)
tableChildCost, err := tablePath.getPlanCostVer2(taskType, option)
Expand All @@ -297,7 +303,7 @@ func (p *PhysicalIndexMergeReader) getPlanCostVer2(taskType property.TaskType, o
indexSideCost := make([]costVer2, 0, len(p.partialPlans))
for _, indexPath := range p.partialPlans {
rows := getCardinality(indexPath, option.CostFlag)
rowSize := getAvgRowSize(indexPath.Stats(), indexPath.Schema())
rowSize := getAvgRowSize(indexPath.Stats(), indexPath.Schema().Columns)

indexNetCost := netCostVer2(option, rows, rowSize, netFactor)
indexChildCost, err := indexPath.getPlanCostVer2(taskType, option)
Expand Down Expand Up @@ -329,7 +335,7 @@ func (p *PhysicalSort) getPlanCostVer2(taskType property.TaskType, option *PlanC
}

rows := math.Max(getCardinality(p.children[0], option.CostFlag), 1)
rowSize := getAvgRowSize(p.statsInfo(), p.Schema())
rowSize := getAvgRowSize(p.statsInfo(), p.Schema().Columns)
cpuFactor := getTaskCPUFactorVer2(p, taskType)
memFactor := getTaskMemFactorVer2(p, taskType)
diskFactor := defaultVer2Factors.TiDBDisk
Expand Down Expand Up @@ -378,7 +384,7 @@ func (p *PhysicalTopN) getPlanCostVer2(taskType property.TaskType, option *PlanC

rows := getCardinality(p.children[0], option.CostFlag)
N := math.Max(1, float64(p.Count+p.Offset))
rowSize := getAvgRowSize(p.statsInfo(), p.Schema())
rowSize := getAvgRowSize(p.statsInfo(), p.Schema().Columns)
cpuFactor := getTaskCPUFactorVer2(p, taskType)
memFactor := getTaskMemFactorVer2(p, taskType)

Expand Down Expand Up @@ -429,7 +435,7 @@ func (p *PhysicalHashAgg) getPlanCostVer2(taskType property.TaskType, option *Pl

inputRows := getCardinality(p.children[0], option.CostFlag)
outputRows := getCardinality(p, option.CostFlag)
outputRowSize := getAvgRowSize(p.Stats(), p.Schema())
outputRowSize := getAvgRowSize(p.Stats(), p.Schema().Columns)
cpuFactor := getTaskCPUFactorVer2(p, taskType)
memFactor := getTaskMemFactorVer2(p, taskType)
concurrency := float64(p.ctx.GetSessionVars().HashAggFinalConcurrency())
Expand Down Expand Up @@ -501,7 +507,7 @@ func (p *PhysicalHashJoin) getPlanCostVer2(taskType property.TaskType, option *P
}
buildRows := getCardinality(build, option.CostFlag)
probeRows := getCardinality(probe, option.CostFlag)
buildRowSize := getAvgRowSize(build.Stats(), build.Schema())
buildRowSize := getAvgRowSize(build.Stats(), build.Schema().Columns)
tidbConcurrency := float64(p.Concurrency)
mppConcurrency := float64(3) // TODO: remove this empirical value
cpuFactor := getTaskCPUFactorVer2(p, taskType)
Expand Down Expand Up @@ -645,7 +651,7 @@ func (p *PhysicalExchangeReceiver) getPlanCostVer2(taskType property.TaskType, o
}

rows := getCardinality(p, option.CostFlag)
rowSize := getAvgRowSize(p.stats, p.Schema())
rowSize := getAvgRowSize(p.stats, p.Schema().Columns)
netFactor := getTaskNetFactorVer2(p, taskType)
isBCast := false
if sender, ok := p.children[0].(*PhysicalExchangeSender); ok {
Expand Down Expand Up @@ -678,7 +684,7 @@ func (p *PointGetPlan) getPlanCostVer2(taskType property.TaskType, option *PlanC
p.planCostInit = true
return zeroCostVer2, nil
}
rowSize := getAvgRowSize(p.stats, p.schema)
rowSize := getAvgRowSize(p.stats, p.schema.Columns)
netFactor := getTaskNetFactorVer2(p, taskType)

p.planCostVer2 = netCostVer2(option, 1, rowSize, netFactor)
Expand All @@ -698,7 +704,7 @@ func (p *BatchPointGetPlan) getPlanCostVer2(taskType property.TaskType, option *
return zeroCostVer2, nil
}
rows := getCardinality(p, option.CostFlag)
rowSize := getAvgRowSize(p.stats, p.schema)
rowSize := getAvgRowSize(p.stats, p.schema.Columns)
netFactor := getTaskNetFactorVer2(p, taskType)

p.planCostVer2 = netCostVer2(option, rows, rowSize, netFactor)
Expand Down
44 changes: 42 additions & 2 deletions planner/core/plan_cost_ver2_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,15 +142,55 @@ func TestCostModelShowFormula(t *testing.T) {
actual := make([][]interface{}, 0, len(plan))
for _, row := range plan {
actual = append(actual, []interface{}{row[0], row[3]}) // id,costFormula
fmt.Println(actual)
}
require.Equal(t, actual, [][]interface{}{
{"TableReader_7", "((Selection_6) + (net(2*rowsize(16)*tidb_kv_net_factor(3.96))))/15"},
{"└─Selection_6", "(cpu(3*filters(1)*tikv_cpu_factor(49.9))) + (TableFullScan_5)"},
{" └─TableFullScan_5", "scan(3*logrowsize(29)*tikv_scan_factor(40.7))"},
{" └─TableFullScan_5", "scan(3*logrowsize(32)*tikv_scan_factor(40.7))"},
})
}

// TestCostModelVer2ScanRowSize verifies the scan-row-size rule of cost model
// version 2: an index scan is always charged the average row size of the whole
// index (regardless of which indexed columns the query actually reads), and a
// table scan is always charged the average row size of all table columns.
// It also checks the optimizer consequently prefers the narrowest index that
// covers the query.
func TestCostModelVer2ScanRowSize(t *testing.T) {
	store := testkit.CreateMockStore(t)
	tk := testkit.NewTestKit(t, store)
	tk.MustExec("use test")
	tk.MustExec(`create table t (pk int, a int, b int, c int, d int, primary key(pk), index ab(a, b), index abc(a, b, c))`)
	tk.MustExec("insert into t values (1, 1, 1, 1, 1)")
	tk.MustExec(`set @@tidb_cost_model_version=2`)

	cases := []struct {
		query       string
		scanFormula string
	}{
		// index scan row-size on idx_ab is always equal to row-size(index_ab)
		{"select a from t use index(ab) where a=1", "scan(1*logrowsize(32)*tikv_scan_factor(40.7))"},
		{"select a, b from t use index(ab) where a=1", "scan(1*logrowsize(32)*tikv_scan_factor(40.7))"},
		{"select b from t use index(ab) where a=1 and b=1", "scan(1*logrowsize(32)*tikv_scan_factor(40.7))"},
		// index scan row-size on idx_abc is always equal to row-size(index_abc)
		{"select a from t use index(abc) where a=1", "scan(1*logrowsize(48)*tikv_scan_factor(40.7))"},
		{"select a from t use index(abc) where a=1 and b=1", "scan(1*logrowsize(48)*tikv_scan_factor(40.7))"},
		{"select a, b from t use index(abc) where a=1 and b=1", "scan(1*logrowsize(48)*tikv_scan_factor(40.7))"},
		{"select a, b, c from t use index(abc) where a=1 and b=1 and c=1", "scan(1*logrowsize(48)*tikv_scan_factor(40.7))"},
		// table scan row-size is always equal to row-size(*)
		{"select a from t use index(primary) where a=1", "scan(1*logrowsize(80)*tikv_scan_factor(40.7))"},
		{"select a, d from t use index(primary) where a=1", "scan(1*logrowsize(80)*tikv_scan_factor(40.7))"},
		{"select * from t use index(primary) where a=1", "scan(1*logrowsize(80)*tikv_scan_factor(40.7))"},
	}
	for _, c := range cases {
		rs := tk.MustQuery("explain analyze format=true_card_cost " + c.query).Rows()
		// The last row of the explain output is the scan operator; column 3
		// holds its cost formula.
		scan := rs[len(rs)-1]
		formula := scan[3]
		// require.Equal takes (t, expected, actual): pass the case's expected
		// formula first so failure messages label the two values correctly.
		require.Equal(t, c.scanFormula, formula)
	}

	tk.MustQuery("explain select a from t where a=1").Check(testkit.Rows(
		`IndexReader_6 10.00 root index:IndexRangeScan_5`, // use idx_ab automatically since it has the smallest row-size in all access paths.
		`└─IndexRangeScan_5 10.00 cop[tikv] table:t, index:ab(a, b) range:[1,1], keep order:false, stats:pseudo`))
	tk.MustQuery("explain select a, b, c from t where a=1").Check(testkit.Rows(
		`IndexReader_6 10.00 root index:IndexRangeScan_5`, // use idx_abc automatically
		`└─IndexRangeScan_5 10.00 cop[tikv] table:t, index:abc(a, b, c) range:[1,1], keep order:false, stats:pseudo`))
}

func TestCostModelTraceVer2(t *testing.T) {
store := testkit.CreateMockStore(t)
tk := testkit.NewTestKit(t, store)
Expand Down
5 changes: 2 additions & 3 deletions planner/core/task.go
Original file line number Diff line number Diff line change
Expand Up @@ -297,12 +297,11 @@ func (p *PhysicalIndexJoin) attach2Task(tasks ...task) task {
return t
}

func getAvgRowSize(stats *property.StatsInfo, schema *expression.Schema) (size float64) {
func getAvgRowSize(stats *property.StatsInfo, cols []*expression.Column) (size float64) {
if stats.HistColl != nil {
size = stats.HistColl.GetAvgRowSizeListInDisk(schema.Columns)
size = stats.HistColl.GetAvgRowSizeListInDisk(cols)
} else {
// Estimate using just the type info.
cols := schema.Columns
for _, col := range cols {
size += float64(chunk.EstimateTypeWidth(col.GetType()))
}
Expand Down
Loading

0 comments on commit ce62915

Please sign in to comment.