scheduler: skip evict-leader-scheduler when setting schedule deny label (tikv#8303)

ref tikv#7300, close tikv#7853

- add a real cluster test covering `skip evict-leader-scheduler when setting schedule deny label`
- add `DeleteStoreLabel` API and `DeleteScheduler` API
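
A minimal sketch of how the new `DeleteScheduler` client call pairs with the existing `CreateScheduler` call in the graceful-restart flow this change is about (evict leaders from a store, perform the maintenance, then drop the scheduler). The `withEvictLeader` helper and its package are illustrative assumptions; only the two client methods and `schedulers.EvictLeaderName` come from this commit and its test.

package example // hypothetical package for this sketch

import (
	"context"

	pd "github.com/tikv/pd/client/http"
	"github.com/tikv/pd/pkg/schedule/schedulers"
)

// withEvictLeader evicts leaders from storeID for the duration of fn (for
// example a rolling restart of that TiKV instance), then removes the
// evict-leader-scheduler again through the new DeleteScheduler client API.
func withEvictLeader(ctx context.Context, cli pd.Client, storeID uint64, fn func() error) error {
	if err := cli.CreateScheduler(ctx, schedulers.EvictLeaderName, storeID); err != nil {
		return err
	}
	// Best-effort cleanup; an error here only means the scheduler lingers.
	defer func() { _ = cli.DeleteScheduler(ctx, schedulers.EvictLeaderName) }()
	return fn()
}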

Signed-off-by: okJiang <819421878@qq.com>
okJiang authored and rleungx committed Sep 10, 2024
1 parent d71a1a3 commit 3837451
Showing 5 changed files with 218 additions and 18 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -25,3 +25,5 @@ coverage.xml
 coverage
 *.txt
 go.work*
+embedded_assets_handler.go
+*.log
38 changes: 20 additions & 18 deletions pkg/schedule/schedulers/scheduler_controller.go
@@ -452,6 +452,8 @@ func (s *ScheduleController) Stop() {
 
 // Schedule tries to create some operators.
 func (s *ScheduleController) Schedule(diagnosable bool) []*operator.Operator {
+	_, isEvictLeaderScheduler := s.Scheduler.(*evictLeaderScheduler)
+retry:
 	for i := 0; i < maxScheduleRetries; i++ {
 		// no need to retry if schedule should stop to speed exit
 		select {
@@ -466,29 +468,29 @@ func (s *ScheduleController) Schedule(diagnosable bool) []*operator.Operator {
 		if diagnosable {
 			s.diagnosticRecorder.SetResultFromPlans(ops, plans)
 		}
-		foundDisabled := false
+		if len(ops) == 0 {
+			continue
+		}
+		// If we have schedule, reset interval to the minimal interval.
+		s.nextInterval = s.Scheduler.GetMinInterval()
 		for _, op := range ops {
-			if labelMgr := s.cluster.GetRegionLabeler(); labelMgr != nil {
-				region := s.cluster.GetRegion(op.RegionID())
-				if region == nil {
-					continue
-				}
-				if labelMgr.ScheduleDisabled(region) {
-					denySchedulersByLabelerCounter.Inc()
-					foundDisabled = true
-					break
-				}
+			region := s.cluster.GetRegion(op.RegionID())
+			if region == nil {
+				continue retry
 			}
-		}
-		if len(ops) > 0 {
-			// If we have schedule, reset interval to the minimal interval.
-			s.nextInterval = s.Scheduler.GetMinInterval()
-			// try regenerating operators
-			if foundDisabled {
+			labelMgr := s.cluster.GetRegionLabeler()
+			if labelMgr == nil {
 				continue
 			}
-			return ops
+
+			// If the evict-leader-scheduler is disabled, it will obstruct the restart operation of tikv by the operator.
+			// Refer: https://docs.pingcap.com/tidb-in-kubernetes/stable/restart-a-tidb-cluster#perform-a-graceful-restart-to-a-single-tikv-pod
+			if labelMgr.ScheduleDisabled(region) && !isEvictLeaderScheduler {
+				denySchedulersByLabelerCounter.Inc()
+				continue retry
+			}
 		}
+		return ops
 	}
 	s.nextInterval = s.Scheduler.GetNextInterval(s.nextInterval)
 	return nil
3 changes: 3 additions & 0 deletions server/cluster/cluster.go
@@ -1364,6 +1364,9 @@ func (c *RaftCluster) DeleteStoreLabel(storeID uint64, labelKey string) error {
 	if store == nil {
 		return errs.ErrInvalidStoreID.FastGenByArgs(storeID)
 	}
+	if len(store.GetLabels()) == 0 {
+		return errors.Errorf("the label key %s does not exist", labelKey)
+	}
 	newStore := typeutil.DeepClone(store.GetMeta(), core.StoreFactory)
 	labels := make([]*metapb.StoreLabel, 0, len(newStore.GetLabels())-1)
 	for _, label := range newStore.GetLabels() {
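The guard added above makes `DeleteStoreLabel` return an explicit error when the store has no labels at all. A test-style sketch of the expected behaviour; the `RaftCluster` fixture `c`, the label key, and testify's `re` are assumptions, only the method signature comes from this diff:

	// Assuming store 1 is registered in the RaftCluster `c` with no labels yet:
	err := c.DeleteStoreLabel(1, "zone")
	re.Error(err) // rejected with "the label key zone does not exist"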
5 changes: 5 additions & 0 deletions tools/pd-api-bench/pd.toml
@@ -0,0 +1,5 @@
[schedule]
patrol-region-interval = "100ms"

[log]
level = "debug"
188 changes: 188 additions & 0 deletions tools/pd-api-bench/scheduler_test.go
@@ -0,0 +1,188 @@
// Copyright 2024 TiKV Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package realcluster

import (
	"context"
	"fmt"
	"sort"
	"testing"
	"time"

	"github.com/stretchr/testify/require"
	pd "github.com/tikv/pd/client/http"
	"github.com/tikv/pd/client/testutil"
	"github.com/tikv/pd/pkg/schedule/labeler"
	"github.com/tikv/pd/pkg/schedule/schedulers"
)

// https://github.com/tikv/pd/issues/6988#issuecomment-1694924611
// https://github.com/tikv/pd/issues/6897
func TestTransferLeader(t *testing.T) {
	re := require.New(t)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	resp, err := pdHTTPCli.GetLeader(ctx)
	re.NoError(err)
	oldLeader := resp.Name

	var newLeader string
	for i := 0; i < 2; i++ {
		if resp.Name != fmt.Sprintf("pd-%d", i) {
			newLeader = fmt.Sprintf("pd-%d", i)
		}
	}

	// record scheduler
	re.NoError(pdHTTPCli.CreateScheduler(ctx, schedulers.EvictLeaderName, 1))
	defer func() {
		re.NoError(pdHTTPCli.DeleteScheduler(ctx, schedulers.EvictLeaderName))
	}()
	res, err := pdHTTPCli.GetSchedulers(ctx)
	re.NoError(err)
	oldSchedulersLen := len(res)

	re.NoError(pdHTTPCli.TransferLeader(ctx, newLeader))
	// wait for transfer leader to new leader
	time.Sleep(1 * time.Second)
	resp, err = pdHTTPCli.GetLeader(ctx)
	re.NoError(err)
	re.Equal(newLeader, resp.Name)

	res, err = pdHTTPCli.GetSchedulers(ctx)
	re.NoError(err)
	re.Len(res, oldSchedulersLen)

	// transfer leader to old leader
	re.NoError(pdHTTPCli.TransferLeader(ctx, oldLeader))
	// wait for transfer leader
	time.Sleep(1 * time.Second)
	resp, err = pdHTTPCli.GetLeader(ctx)
	re.NoError(err)
	re.Equal(oldLeader, resp.Name)

	res, err = pdHTTPCli.GetSchedulers(ctx)
	re.NoError(err)
	re.Len(res, oldSchedulersLen)
}

func TestRegionLabelDenyScheduler(t *testing.T) {
	re := require.New(t)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	regions, err := pdHTTPCli.GetRegions(ctx)
	re.NoError(err)
	re.GreaterOrEqual(len(regions.Regions), 1)
	region1 := regions.Regions[0]

	err = pdHTTPCli.DeleteScheduler(ctx, schedulers.BalanceLeaderName)
	if err == nil {
		defer func() {
			pdHTTPCli.CreateScheduler(ctx, schedulers.BalanceLeaderName, 0)
		}()
	}

	re.NoError(pdHTTPCli.CreateScheduler(ctx, schedulers.GrantLeaderName, uint64(region1.Leader.StoreID)))
	defer func() {
		pdHTTPCli.DeleteScheduler(ctx, schedulers.GrantLeaderName)
	}()

	// wait leader transfer
	testutil.Eventually(re, func() bool {
		regions, err := pdHTTPCli.GetRegions(ctx)
		re.NoError(err)
		for _, region := range regions.Regions {
			if region.Leader.StoreID != region1.Leader.StoreID {
				return false
			}
		}
		return true
	}, testutil.WithWaitFor(time.Minute))

	// disable schedule for region1
	labelRule := &pd.LabelRule{
		ID:       "rule1",
		Labels:   []pd.RegionLabel{{Key: "schedule", Value: "deny"}},
		RuleType: "key-range",
		Data:     labeler.MakeKeyRanges(region1.StartKey, region1.EndKey),
	}
	re.NoError(pdHTTPCli.SetRegionLabelRule(ctx, labelRule))
	defer func() {
		pdHTTPCli.PatchRegionLabelRules(ctx, &pd.LabelRulePatch{DeleteRules: []string{labelRule.ID}})
	}()
	labelRules, err := pdHTTPCli.GetAllRegionLabelRules(ctx)
	re.NoError(err)
	re.Len(labelRules, 2)
	sort.Slice(labelRules, func(i, j int) bool {
		return labelRules[i].ID < labelRules[j].ID
	})
	re.Equal(labelRule.ID, labelRules[1].ID)
	re.Equal(labelRule.Labels, labelRules[1].Labels)
	re.Equal(labelRule.RuleType, labelRules[1].RuleType)

	// enable evict leader scheduler, and check it works
	re.NoError(pdHTTPCli.DeleteScheduler(ctx, schedulers.GrantLeaderName))
	re.NoError(pdHTTPCli.CreateScheduler(ctx, schedulers.EvictLeaderName, uint64(region1.Leader.StoreID)))
	defer func() {
		pdHTTPCli.DeleteScheduler(ctx, schedulers.EvictLeaderName)
	}()
	testutil.Eventually(re, func() bool {
		regions, err := pdHTTPCli.GetRegions(ctx)
		re.NoError(err)
		for _, region := range regions.Regions {
			if region.Leader.StoreID == region1.Leader.StoreID {
				return false
			}
		}
		return true
	}, testutil.WithWaitFor(time.Minute))

	re.NoError(pdHTTPCli.DeleteScheduler(ctx, schedulers.EvictLeaderName))
	re.NoError(pdHTTPCli.CreateScheduler(ctx, schedulers.GrantLeaderName, uint64(region1.Leader.StoreID)))
	defer func() {
		pdHTTPCli.DeleteScheduler(ctx, schedulers.GrantLeaderName)
	}()
	testutil.Eventually(re, func() bool {
		regions, err := pdHTTPCli.GetRegions(ctx)
		re.NoError(err)
		for _, region := range regions.Regions {
			if region.ID == region1.ID {
				continue
			}
			if region.Leader.StoreID != region1.Leader.StoreID {
				return false
			}
		}
		return true
	}, testutil.WithWaitFor(time.Minute))

	pdHTTPCli.PatchRegionLabelRules(ctx, &pd.LabelRulePatch{DeleteRules: []string{labelRule.ID}})
	labelRules, err = pdHTTPCli.GetAllRegionLabelRules(ctx)
	re.NoError(err)
	re.Len(labelRules, 1)

	testutil.Eventually(re, func() bool {
		regions, err := pdHTTPCli.GetRegions(ctx)
		re.NoError(err)
		for _, region := range regions.Regions {
			if region.Leader.StoreID != region1.Leader.StoreID {
				return false
			}
		}
		return true
	}, testutil.WithWaitFor(time.Minute))
}
