Commit 04f9b3c
*: add UseAutoScaler config to disable AutoScaler (#40966)
close #40971
1 parent c215108 commit 04f9b3c

File tree (8 files changed: +168 -4 lines changed)

    config/config.go
    config/config.toml.example
    executor/tiflashtest/tiflash_test.go
    session/session.go
    store/copr/batch_coprocessor.go
    store/copr/mpp.go
    tidb-server/main.go
    util/tiflashcompute/topo_fetcher.go

config/config.go (+4 -1)
@@ -295,6 +295,8 @@ type Config struct {
     TiFlashComputeAutoScalerAddr string `toml:"autoscaler-addr" json:"autoscaler-addr"`
     IsTiFlashComputeFixedPool    bool   `toml:"is-tiflashcompute-fixed-pool" json:"is-tiflashcompute-fixed-pool"`
     AutoScalerClusterID          string `toml:"autoscaler-cluster-id" json:"autoscaler-cluster-id"`
+    // todo: remove this after AutoScaler is stable.
+    UseAutoScaler bool `toml:"use-autoscaler" json:"use-autoscaler"`

     // TiDBMaxReuseChunk indicates max cached chunk num
     TiDBMaxReuseChunk uint32 `toml:"tidb-max-reuse-chunk" json:"tidb-max-reuse-chunk"`

@@ -1012,6 +1014,7 @@ var defaultConf = Config{
     TiFlashComputeAutoScalerAddr: tiflashcompute.DefAWSAutoScalerAddr,
     IsTiFlashComputeFixedPool:    false,
     AutoScalerClusterID:          "",
+    UseAutoScaler:                true,
     TiDBMaxReuseChunk:            64,
     TiDBMaxReuseColumn:           256,
     TiDBEnableExitCheck:          false,

@@ -1348,7 +1351,7 @@ func (c *Config) Valid() error {
     }

     // Check tiflash_compute topo fetch is valid.
-    if c.DisaggregatedTiFlash {
+    if c.DisaggregatedTiFlash && c.UseAutoScaler {
         if !tiflashcompute.IsValidAutoScalerConfig(c.TiFlashComputeAutoScalerType) {
             return fmt.Errorf("invalid AutoScaler type, expect %s, %s or %s, got %s",
                 tiflashcompute.MockASStr, tiflashcompute.AWSASStr, tiflashcompute.GCPASStr, c.TiFlashComputeAutoScalerType)
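
The new field drives a single gate that recurs throughout this commit: the AutoScaler code paths run only when both DisaggregatedTiFlash and UseAutoScaler are set; otherwise TiDB falls back to PD for tiflash_compute topology. Below is a minimal, self-contained sketch of that decision, assuming the config package import path github.com/pingcap/tidb/config (the topoSource helper is hypothetical, for illustration only, not part of the commit):

package main

import (
    "fmt"

    "github.com/pingcap/tidb/config"
)

// topoSource reports where tiflash_compute topology would come from under the
// current global config, mirroring the checks added in this commit.
func topoSource() string {
    cfg := config.GetGlobalConfig()
    switch {
    case cfg.DisaggregatedTiFlash && cfg.UseAutoScaler:
        return "AutoScaler"
    case cfg.DisaggregatedTiFlash:
        return "PD" // fallback path introduced by this commit
    default:
        return "none (disaggregated TiFlash disabled)"
    }
}

func main() {
    fmt.Println(topoSource())
}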

config/config.toml.example (+4)
@@ -129,6 +129,10 @@ autoscaler-addr = "tiflash-autoscale-lb.tiflash-autoscale.svc.cluster.local:8081"
 # Only meaningful when disaggregated-tiflash is true.
 autoscaler-cluster-id = ""

+# use-autoscaler indicates whether to use AutoScaler or PD for tiflash_compute nodes, only meaningful when disaggregated-tiflash is true.
+# This option will be removed after AutoScaler is stable.
+use-autoscaler = true
+
 [log]
 # Log level: debug, info, warn, error, fatal.
 level = "info"

executor/tiflashtest/tiflash_test.go (+31)
@@ -1319,6 +1319,37 @@ func TestDisaggregatedTiFlash(t *testing.T) {
     require.Contains(t, err.Error(), "[util:1815]Internal : get tiflash_compute topology failed")
 }

+// todo: remove this after AutoScaler is stable.
+func TestDisaggregatedTiFlashNonAutoScaler(t *testing.T) {
+    config.UpdateGlobal(func(conf *config.Config) {
+        conf.DisaggregatedTiFlash = true
+        conf.UseAutoScaler = false
+    })
+    defer config.UpdateGlobal(func(conf *config.Config) {
+        conf.DisaggregatedTiFlash = false
+        conf.UseAutoScaler = true
+    })
+
+    // Set globalTopoFetcher to nil to make sure topo cannot be fetched from AutoScaler.
+    err := tiflashcompute.InitGlobalTopoFetcher(tiflashcompute.InvalidASStr, "", "", false)
+    require.Contains(t, err.Error(), "unexpected topo fetch type. expect: mock or aws or gcp, got invalid")
+
+    store := testkit.CreateMockStore(t, withMockTiFlash(2))
+    tk := testkit.NewTestKit(t, store)
+    tk.MustExec("use test")
+    tk.MustExec("drop table if exists t")
+    tk.MustExec("create table t(c1 int)")
+    tk.MustExec("alter table t set tiflash replica 1")
+    tb := external.GetTableByName(t, tk, "test", "t")
+    err = domain.GetDomain(tk.Session()).DDL().UpdateTableReplicaInfo(tk.Session(), tb.Meta().ID, true)
+    require.NoError(t, err)
+    tk.MustExec("set @@session.tidb_isolation_read_engines=\"tiflash\"")
+
+    err = tk.ExecToErr("select * from t;")
+    // This error message means we used PD instead of AutoScaler.
+    require.Contains(t, err.Error(), "tiflash_compute node is unavailable")
+}
+
 func TestDisaggregatedTiFlashQuery(t *testing.T) {
     config.UpdateGlobal(func(conf *config.Config) {
         conf.DisaggregatedTiFlash = true
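
The UpdateGlobal/defer pair in the new test above is the pattern for flipping these flags at runtime in tests. A small helper extracted from that pattern is sketched below purely as an illustration (useAutoScalerForTest and the package clause are made up; only the UpdateGlobal calls come from the commit):

package tiflashtest // hypothetical test package name

import "github.com/pingcap/tidb/config"

// useAutoScalerForTest switches a test process between the AutoScaler path and
// the PD fallback and returns a restore func; the UpdateGlobal calls mirror
// TestDisaggregatedTiFlashNonAutoScaler above.
func useAutoScalerForTest(enable bool) (restore func()) {
    config.UpdateGlobal(func(conf *config.Config) {
        conf.DisaggregatedTiFlash = true
        conf.UseAutoScaler = enable
    })
    return func() {
        config.UpdateGlobal(func(conf *config.Config) {
            conf.DisaggregatedTiFlash = false
            conf.UseAutoScaler = true // matches the new default in defaultConf
        })
    }
}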

session/session.go (+8)
@@ -3339,6 +3339,14 @@ func BootstrapSession(store kv.Storage) (*domain.Domain, error) {
         return nil, err
     }

+    if config.GetGlobalConfig().DisaggregatedTiFlash && !config.GetGlobalConfig().UseAutoScaler {
+        // Invalidate client-go's tiflash_compute store cache if necessary.
+        err = dom.WatchTiFlashComputeNodeChange()
+        if err != nil {
+            return nil, err
+        }
+    }
+
     if err = extensionimpl.Bootstrap(context.Background(), dom); err != nil {
         return nil, err
     }

store/copr/batch_coprocessor.go (+99 -2)
@@ -494,7 +494,10 @@ func buildBatchCopTasksForNonPartitionedTable(
     balanceWithContinuity bool,
     balanceContinuousRegionCount int64) ([]*batchCopTask, error) {
     if config.GetGlobalConfig().DisaggregatedTiFlash {
-        return buildBatchCopTasksConsistentHash(ctx, bo, store, []*KeyRanges{ranges}, storeType, ttl)
+        if config.GetGlobalConfig().UseAutoScaler {
+            return buildBatchCopTasksConsistentHash(ctx, bo, store, []*KeyRanges{ranges}, storeType, ttl)
+        }
+        return buildBatchCopTasksConsistentHashForPD(bo, store, []*KeyRanges{ranges}, storeType, ttl)
     }
     return buildBatchCopTasksCore(bo, store, []*KeyRanges{ranges}, storeType, isMPP, ttl, balanceWithContinuity, balanceContinuousRegionCount)
 }

@@ -511,7 +514,12 @@ func buildBatchCopTasksForPartitionedTable(
     balanceContinuousRegionCount int64,
     partitionIDs []int64) (batchTasks []*batchCopTask, err error) {
     if config.GetGlobalConfig().DisaggregatedTiFlash {
-        batchTasks, err = buildBatchCopTasksConsistentHash(ctx, bo, store, rangesForEachPhysicalTable, storeType, ttl)
+        if config.GetGlobalConfig().UseAutoScaler {
+            batchTasks, err = buildBatchCopTasksConsistentHash(ctx, bo, store, rangesForEachPhysicalTable, storeType, ttl)
+        } else {
+            // todo: remove this after AutoScaler is stable.
+            batchTasks, err = buildBatchCopTasksConsistentHashForPD(bo, store, rangesForEachPhysicalTable, storeType, ttl)
+        }
     } else {
         batchTasks, err = buildBatchCopTasksCore(bo, store, rangesForEachPhysicalTable, storeType, isMPP, ttl, balanceWithContinuity, balanceContinuousRegionCount)
     }

@@ -1169,3 +1177,92 @@ func (b *batchCopIterator) handleCollectExecutionInfo(bo *Backoffer, resp *batch
     }
     resp.detail.CalleeAddress = task.storeAddr
 }
+
+// Only called when UseAutoScaler is false.
+func buildBatchCopTasksConsistentHashForPD(bo *backoff.Backoffer,
+    kvStore *kvStore,
+    rangesForEachPhysicalTable []*KeyRanges,
+    storeType kv.StoreType,
+    ttl time.Duration) (res []*batchCopTask, err error) {
+    const cmdType = tikvrpc.CmdBatchCop
+    var retryNum int
+    cache := kvStore.GetRegionCache()
+
+    for {
+        retryNum++
+        var rangesLen int
+        tasks := make([]*copTask, 0)
+        regionIDs := make([]tikv.RegionVerID, 0)
+
+        for i, ranges := range rangesForEachPhysicalTable {
+            rangesLen += ranges.Len()
+            locations, err := cache.SplitKeyRangesByLocations(bo, ranges, UnspecifiedLimit)
+            if err != nil {
+                return nil, errors.Trace(err)
+            }
+            for _, lo := range locations {
+                tasks = append(tasks, &copTask{
+                    region:         lo.Location.Region,
+                    ranges:         lo.Ranges,
+                    cmdType:        cmdType,
+                    storeType:      storeType,
+                    partitionIndex: int64(i),
+                })
+                regionIDs = append(regionIDs, lo.Location.Region)
+            }
+        }
+
+        stores, err := cache.GetTiFlashComputeStores(bo.TiKVBackoffer())
+        if err != nil {
+            return nil, err
+        }
+        stores = filterAliveStores(bo.GetCtx(), stores, ttl, kvStore)
+        if len(stores) == 0 {
+            return nil, errors.New("tiflash_compute node is unavailable")
+        }
+
+        rpcCtxs, err := cache.GetTiFlashComputeRPCContextByConsistentHash(bo.TiKVBackoffer(), regionIDs, stores)
+        if err != nil {
+            return nil, err
+        }
+        if rpcCtxs == nil {
+            logutil.BgLogger().Info("buildBatchCopTasksConsistentHash retry because rpcCtxs is nil", zap.Int("retryNum", retryNum))
+            err := bo.Backoff(tikv.BoTiFlashRPC(), errors.New("Cannot find region with TiFlash peer"))
+            if err != nil {
+                return nil, errors.Trace(err)
+            }
+            continue
+        }
+        if len(rpcCtxs) != len(tasks) {
+            return nil, errors.Errorf("length should be equal, len(rpcCtxs): %d, len(tasks): %d", len(rpcCtxs), len(tasks))
+        }
+        taskMap := make(map[string]*batchCopTask)
+        for i, rpcCtx := range rpcCtxs {
+            regionInfo := RegionInfo{
+                // tasks and rpcCtxs correspond to each other.
+                Region:         tasks[i].region,
+                Meta:           rpcCtx.Meta,
+                Ranges:         tasks[i].ranges,
+                AllStores:      []uint64{rpcCtx.Store.StoreID()},
+                PartitionIndex: tasks[i].partitionIndex,
+            }
+            if batchTask, ok := taskMap[rpcCtx.Addr]; ok {
+                batchTask.regionInfos = append(batchTask.regionInfos, regionInfo)
+            } else {
+                batchTask := &batchCopTask{
+                    storeAddr:   rpcCtx.Addr,
+                    cmdType:     cmdType,
+                    ctx:         rpcCtx,
+                    regionInfos: []RegionInfo{regionInfo},
+                }
+                taskMap[rpcCtx.Addr] = batchTask
+                res = append(res, batchTask)
+            }
+        }
+        logutil.BgLogger().Info("buildBatchCopTasksConsistentHash done", zap.Any("len(tasks)", len(taskMap)), zap.Any("len(tiflash_compute)", len(stores)))
+        break
+    }
+
+    failpointCheckForConsistentHash(res)
+    return res, nil
+}
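
buildBatchCopTasksConsistentHashForPD ends with the same per-store folding step as the AutoScaler path: each region is assigned to one alive tiflash_compute store, and all regions that land on the same address are merged into a single batchCopTask. A stripped-down, self-contained illustration of that folding step follows (regionInfo, batchTask, and groupByStore are simplified stand-ins, not the real copr types):

package main

import "fmt"

// Simplified stand-ins for the real copr types.
type regionInfo struct{ regionID uint64 }

type batchTask struct {
    storeAddr string
    regions   []regionInfo
}

// groupByStore folds per-region store assignments into one task per store
// address, mirroring the taskMap loop in buildBatchCopTasksConsistentHashForPD.
func groupByStore(assignments map[uint64]string) []*batchTask {
    taskMap := make(map[string]*batchTask)
    var res []*batchTask
    for regionID, addr := range assignments {
        ri := regionInfo{regionID: regionID}
        if task, ok := taskMap[addr]; ok {
            task.regions = append(task.regions, ri)
        } else {
            task := &batchTask{storeAddr: addr, regions: []regionInfo{ri}}
            taskMap[addr] = task
            res = append(res, task)
        }
    }
    return res
}

func main() {
    tasks := groupByStore(map[uint64]string{1: "tc-0:3930", 2: "tc-1:3930", 3: "tc-0:3930"})
    for _, task := range tasks {
        fmt.Println(task.storeAddr, "handles", len(task.regions), "region(s)")
    }
}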

store/copr/mpp.go (+18)
@@ -250,6 +250,7 @@ func (m *mppIterator) handleDispatchReq(ctx context.Context, bo *Backoffer, req
     var rpcResp *tikvrpc.Response
     var err error
     var retry bool
+    invalidPDCache := config.GetGlobalConfig().DisaggregatedTiFlash && !config.GetGlobalConfig().UseAutoScaler

     // If copTasks is not empty, we should send request according to region distribution.
     // Or else it's the task without region, which always happens in high layer task without table.

@@ -262,6 +263,9 @@ func (m *mppIterator) handleDispatchReq(ctx context.Context, bo *Backoffer, req
     // That's a hard job but we can try it in the future.
     if sender.GetRPCError() != nil {
         logutil.BgLogger().Warn("mpp dispatch meet io error", zap.String("error", sender.GetRPCError().Error()), zap.Uint64("timestamp", taskMeta.StartTs), zap.Int64("task", taskMeta.TaskId), zap.Int64("mpp-version", taskMeta.MppVersion))
+        if invalidPDCache {
+            m.store.GetRegionCache().InvalidateTiFlashComputeStores()
+        }
         // if needTriggerFallback is true, we return timeout to trigger tikv's fallback
         if m.needTriggerFallback {
             err = derr.ErrTiFlashServerTimeout

@@ -274,6 +278,9 @@ func (m *mppIterator) handleDispatchReq(ctx context.Context, bo *Backoffer, req
     if errors.Cause(err) == context.Canceled || status.Code(errors.Cause(err)) == codes.Canceled {
         retry = false
     } else if err != nil {
+        if invalidPDCache {
+            m.store.GetRegionCache().InvalidateTiFlashComputeStores()
+        }
         if bo.Backoff(tikv.BoTiFlashRPC(), err) == nil {
             retry = true
         }

@@ -351,18 +358,26 @@ func (m *mppIterator) cancelMppTasks() {
     }

     // send cancel cmd to all stores where tasks run
+    invalidPDCache := config.GetGlobalConfig().DisaggregatedTiFlash && !config.GetGlobalConfig().UseAutoScaler
     wg := util.WaitGroupWrapper{}
+    gotErr := atomic.Bool{}
     for addr := range usedStoreAddrs {
         storeAddr := addr
         wg.Run(func() {
             _, err := m.store.GetTiKVClient().SendRequest(context.Background(), storeAddr, wrappedReq, tikv.ReadTimeoutShort)
             logutil.BgLogger().Debug("cancel task", zap.Uint64("query id ", m.startTs), zap.String("on addr", storeAddr), zap.Int64("mpp-version", m.mppVersion.ToInt64()))
             if err != nil {
                 logutil.BgLogger().Error("cancel task error", zap.Error(err), zap.Uint64("query id", m.startTs), zap.String("on addr", storeAddr), zap.Int64("mpp-version", m.mppVersion.ToInt64()))
+                if invalidPDCache {
+                    gotErr.CompareAndSwap(false, true)
+                }
             }
         })
     }
     wg.Wait()
+    if invalidPDCache && gotErr.Load() {
+        m.store.GetRegionCache().InvalidateTiFlashComputeStores()
+    }
 }

 func (m *mppIterator) establishMPPConns(bo *Backoffer, req *kv.MPPDispatchRequest, taskMeta *mpp.TaskMeta) {

@@ -389,6 +404,9 @@ func (m *mppIterator) establishMPPConns(bo *Backoffer, req *kv.MPPDispatchReques

     if err != nil {
         logutil.BgLogger().Warn("establish mpp connection meet error and cannot retry", zap.String("error", err.Error()), zap.Uint64("timestamp", taskMeta.StartTs), zap.Int64("task", taskMeta.TaskId), zap.Int64("mpp-version", taskMeta.MppVersion))
+        if config.GetGlobalConfig().DisaggregatedTiFlash && !config.GetGlobalConfig().UseAutoScaler {
+            m.store.GetRegionCache().InvalidateTiFlashComputeStores()
+        }
         // if needTriggerFallback is true, we return timeout to trigger tikv's fallback
         if m.needTriggerFallback {
             m.sendError(derr.ErrTiFlashServerTimeout)
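
The cancelMppTasks change above uses a small concurrency idiom worth noting: each cancel RPC runs in its own goroutine, a failure only flips a shared atomic flag, and the store-cache invalidation happens exactly once after wg.Wait(). Here is a self-contained sketch of the same idiom with stand-in work instead of the real TiKV client call (sendCancel and the addresses are invented for this example):

package main

import (
    "fmt"
    "sync"
    "sync/atomic"
)

// sendCancel stands in for sending a cancel RPC to one store; it fails for one
// address so the example exercises the error path.
func sendCancel(addr string) error {
    if addr == "tc-1:3930" {
        return fmt.Errorf("store %s unreachable", addr)
    }
    return nil
}

func main() {
    addrs := []string{"tc-0:3930", "tc-1:3930", "tc-2:3930"}

    var wg sync.WaitGroup
    var gotErr atomic.Bool
    for _, addr := range addrs {
        addr := addr // per-iteration copy, as the real code does with storeAddr
        wg.Add(1)
        go func() {
            defer wg.Done()
            if err := sendCancel(addr); err != nil {
                // Record the failure; the expensive reaction happens once, later.
                gotErr.CompareAndSwap(false, true)
            }
        }()
    }
    wg.Wait()

    if gotErr.Load() {
        // In TiDB this is RegionCache.InvalidateTiFlashComputeStores().
        fmt.Println("at least one cancel failed: invalidate cached tiflash_compute stores")
    }
}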

tidb-server/main.go (+1 -1)
@@ -204,7 +204,7 @@ func main() {
     err := cpuprofile.StartCPUProfiler()
     terror.MustNil(err)

-    if config.GetGlobalConfig().DisaggregatedTiFlash {
+    if config.GetGlobalConfig().DisaggregatedTiFlash && config.GetGlobalConfig().UseAutoScaler {
         clusterID, err := config.GetAutoScalerClusterID()
         terror.MustNil(err)

util/tiflashcompute/topo_fetcher.go (+3)
@@ -44,6 +44,8 @@ const (
     GCPASStr = "gcp"
     // TestASStr is string value for test AutoScaler.
     TestASStr = "test"
+    // InvalidASStr is string value for invalid AutoScaler.
+    InvalidASStr = "invalid"
 )

 const (

@@ -127,6 +129,7 @@ func InitGlobalTopoFetcher(typ string, addr string, clusterID string, isFixedPoo
     case TestASType:
         globalTopoFetcher = NewTestAutoScalerFetcher()
     default:
+        globalTopoFetcher = nil
         err = errors.Errorf("unexpected topo fetch type. expect: %s or %s or %s, got %s",
             MockASStr, AWSASStr, GCPASStr, typ)
     }
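
The new InvalidASStr constant, together with the explicit globalTopoFetcher = nil in the default branch, lets tests guarantee that no topology is ever fetched from an AutoScaler. A short, self-contained sketch of that usage, assuming the package import path github.com/pingcap/tidb/util/tiflashcompute (the call and the expected error text are taken from the test added in this commit):

package main

import (
    "fmt"

    "github.com/pingcap/tidb/util/tiflashcompute"
)

func main() {
    // Passing InvalidASStr resets the global fetcher to nil and returns an
    // error, so any later attempt to fetch topology from AutoScaler fails fast.
    err := tiflashcompute.InitGlobalTopoFetcher(tiflashcompute.InvalidASStr, "", "", false)
    fmt.Println(err)
    // Expected (per this commit):
    // unexpected topo fetch type. expect: mock or aws or gcp, got invalid
}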
