From bb4b5c7ae664bdbe083943f8a17e71df2a5df499 Mon Sep 17 00:00:00 2001
From: Suhaha <jklopsdfw@gmail.com>
Date: Tue, 12 Oct 2021 12:01:26 +0800
Subject: [PATCH 1/6] dashboard: update TiDB Dashboard to v2021.10.08.1 (#4070)

* Update to TiDB Dashboard v2021.09.02.2

Signed-off-by: Suhaha <jklopsdfw@gmail.com>

* update: dashboard to v2021.09.03.1 & dashboard distro populate

Signed-off-by: Suhaha <jklopsdfw@gmail.com>

* Update dashboard to v2021.09.27.1

Signed-off-by: Suhaha <jklopsdfw@gmail.com>

* chore: go mod tidy

Signed-off-by: Suhaha <jklopsdfw@gmail.com>

* Update dashboard to v2021.10.08.1

Signed-off-by: Suhaha <jklopsdfw@gmail.com>
---
 go.mod                                                      | 2 +-
 go.sum                                                      | 4 ++--
 pkg/dashboard/distro/distro.go                              | 3 +++
 .../distro/{distro_info_placeholder.go => placeholder.go}   | 6 ------
 4 files changed, 6 insertions(+), 9 deletions(-)
 rename pkg/dashboard/distro/{distro_info_placeholder.go => placeholder.go} (78%)

diff --git a/go.mod b/go.mod
index 129255e8c76..a23c4762627 100644
--- a/go.mod
+++ b/go.mod
@@ -31,7 +31,7 @@ require (
 	github.com/pingcap/kvproto v0.0.0-20210805052247-76981389e818
 	github.com/pingcap/log v0.0.0-20210625125904-98ed8e2eb1c7
 	github.com/pingcap/sysutil v0.0.0-20210730114356-fcd8a63f68c5
-	github.com/pingcap/tidb-dashboard v0.0.0-20210826074103-29034af68525
+	github.com/pingcap/tidb-dashboard v0.0.0-20211008050453-a25c25809529
 	github.com/prometheus/client_golang v1.1.0
 	github.com/prometheus/client_model v0.2.0 // indirect
 	github.com/prometheus/common v0.6.0
diff --git a/go.sum b/go.sum
index 2e3810b7402..b4db54ae744 100644
--- a/go.sum
+++ b/go.sum
@@ -327,8 +327,8 @@ github.com/pingcap/log v0.0.0-20210625125904-98ed8e2eb1c7/go.mod h1:8AanEdAHATuR
 github.com/pingcap/sysutil v0.0.0-20210315073920-cc0985d983a3/go.mod h1:tckvA041UWP+NqYzrJ3fMgC/Hw9wnmQ/tUkp/JaHly8=
 github.com/pingcap/sysutil v0.0.0-20210730114356-fcd8a63f68c5 h1:7rvAtZe/ZUzOKzgriNPQoBNvleJXBk4z7L3Z47+tS98=
 github.com/pingcap/sysutil v0.0.0-20210730114356-fcd8a63f68c5/go.mod h1:XsOaV712rUk63aOEKYP9PhXTIE3FMNHmC2r1wX5wElY=
-github.com/pingcap/tidb-dashboard v0.0.0-20210826074103-29034af68525 h1:hJDAzcVGfVsUx90/JHXPgvHggxsiuYZJ6FKkLY6Vo3M=
-github.com/pingcap/tidb-dashboard v0.0.0-20210826074103-29034af68525/go.mod h1:OCXbZTBTIMRcIt0jFsuCakZP+goYRv6IjawKbwLS2TQ=
+github.com/pingcap/tidb-dashboard v0.0.0-20211008050453-a25c25809529 h1:EVU+6dwm/kVb0kHsh+Wdgu2RGcpK1o9eUZI5QFzZPR4=
+github.com/pingcap/tidb-dashboard v0.0.0-20211008050453-a25c25809529/go.mod h1:OCXbZTBTIMRcIt0jFsuCakZP+goYRv6IjawKbwLS2TQ=
 github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
diff --git a/pkg/dashboard/distro/distro.go b/pkg/dashboard/distro/distro.go
index 3506206f00f..dff0ac47086 100644
--- a/pkg/dashboard/distro/distro.go
+++ b/pkg/dashboard/distro/distro.go
@@ -12,6 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+//go:build dashboard_distro
+// +build dashboard_distro
+
 package distro
 
 import (
diff --git a/pkg/dashboard/distro/distro_info_placeholder.go b/pkg/dashboard/distro/placeholder.go
similarity index 78%
rename from pkg/dashboard/distro/distro_info_placeholder.go
rename to pkg/dashboard/distro/placeholder.go
index 76cd5b28062..3cf9b18b1e3 100644
--- a/pkg/dashboard/distro/distro_info_placeholder.go
+++ b/pkg/dashboard/distro/placeholder.go
@@ -11,11 +11,5 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-// +build !dashboard_distro
 
 package distro
-
-import "github.com/pingcap/tidb-dashboard/pkg/utils/distro"
-
-// Resource declared the distro brand information
-var Resource = distro.Resource

From ce4abef7afc5e6c5ed5db2889789445354d70b25 Mon Sep 17 00:00:00 2001
From: Ti Chi Robot <ti-community-prow-bot@tidb.io>
Date: Wed, 13 Oct 2021 13:27:27 +0800
Subject: [PATCH 2/6] cluster: speed scheduler exit   (#4148) (#4199)

* This is an automated cherry-pick of #4148

Signed-off-by: ti-chi-bot <ti-community-prow-bot@tidb.io>

* resolv conflict

Signed-off-by: bufferflies <1045931706@qq.com>

Co-authored-by: buffer <1045931706@qq.com>
---
 server/cluster/cluster.go     | 2 +-
 server/cluster/coordinator.go | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go
index f769d752660..34453b8b5ba 100644
--- a/server/cluster/cluster.go
+++ b/server/cluster/cluster.go
@@ -383,8 +383,8 @@ func (c *RaftCluster) Stop() {
 	}
 
 	c.running = false
-	close(c.quit)
 	c.coordinator.stop()
+	close(c.quit)
 	c.Unlock()
 	c.wg.Wait()
 	log.Info("raftcluster is stopped")
diff --git a/server/cluster/coordinator.go b/server/cluster/coordinator.go
index 6bfd68a88d2..ecb5506ce9d 100644
--- a/server/cluster/coordinator.go
+++ b/server/cluster/coordinator.go
@@ -822,6 +822,12 @@ func (s *scheduleController) Stop() {
 
 func (s *scheduleController) Schedule() []*operator.Operator {
 	for i := 0; i < maxScheduleRetries; i++ {
+		// no need to retry if schedule should stop to speed exit
+		select {
+		case <-s.ctx.Done():
+			return nil
+		default:
+		}
 		// If we have schedule, reset interval to the minimal interval.
 		if op := s.Scheduler.Schedule(s.cluster); op != nil {
 			s.nextInterval = s.Scheduler.GetMinInterval()

From 30796dbeed0324f9d28505fd2a3738d9035fa3ff Mon Sep 17 00:00:00 2001
From: Ti Chi Robot <ti-community-prow-bot@tidb.io>
Date: Wed, 13 Oct 2021 13:37:27 +0800
Subject: [PATCH 3/6] checker: judging that the peer is down is no longer based
 on DownSeconds (#4078) (#4084)

* checker: judging that the peer is down is no longer based on DownSeconds

Signed-off-by: HunDunDM <hundundm@gmail.com>

* address comment

Signed-off-by: HunDunDM <hundundm@gmail.com>

Co-authored-by: HunDunDM <hundundm@gmail.com>
---
 server/schedule/checker/replica_checker.go | 5 +----
 server/schedule/checker/rule_checker.go    | 4 +---
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/server/schedule/checker/replica_checker.go b/server/schedule/checker/replica_checker.go
index 4f93058d905..5edf8a2d6cc 100644
--- a/server/schedule/checker/replica_checker.go
+++ b/server/schedule/checker/replica_checker.go
@@ -106,13 +106,10 @@ func (r *ReplicaChecker) checkDownPeer(region *core.RegionInfo) *operator.Operat
 			log.Warn("lost the store, maybe you are recovering the PD cluster", zap.Uint64("store-id", storeID))
 			return nil
 		}
+		// Only consider the state of the Store, not `stats.DownSeconds`.
 		if store.DownTime() < r.opts.GetMaxStoreDownTime() {
 			continue
 		}
-		if stats.GetDownSeconds() < uint64(r.opts.GetMaxStoreDownTime().Seconds()) {
-			continue
-		}
-
 		return r.fixPeer(region, storeID, downStatus)
 	}
 	return nil
diff --git a/server/schedule/checker/rule_checker.go b/server/schedule/checker/rule_checker.go
index b7e8ec2a2d6..8287c0aca54 100644
--- a/server/schedule/checker/rule_checker.go
+++ b/server/schedule/checker/rule_checker.go
@@ -298,12 +298,10 @@ func (c *RuleChecker) isDownPeer(region *core.RegionInfo, peer *metapb.Peer) boo
 			log.Warn("lost the store, maybe you are recovering the PD cluster", zap.Uint64("store-id", storeID))
 			return false
 		}
+		// Only consider the state of the Store, not `stats.DownSeconds`.
 		if store.DownTime() < c.cluster.GetOpts().GetMaxStoreDownTime() {
 			continue
 		}
-		if stats.GetDownSeconds() < uint64(c.cluster.GetOpts().GetMaxStoreDownTime().Seconds()) {
-			continue
-		}
 		return true
 	}
 	return false

From 24d138319797b602848b0d387dcb91c736770d44 Mon Sep 17 00:00:00 2001
From: Ti Chi Robot <ti-community-prow-bot@tidb.io>
Date: Wed, 13 Oct 2021 13:47:27 +0800
Subject: [PATCH 4/6] *: fix race in hot region config (#4167) (#4170)

* fix race in hot region config

Signed-off-by: Ryan Leung <rleungx@gmail.com>

* address the comment

Signed-off-by: Ryan Leung <rleungx@gmail.com>

Co-authored-by: Ryan Leung <rleungx@gmail.com>
---
 server/schedulers/hot_region_config.go | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/server/schedulers/hot_region_config.go b/server/schedulers/hot_region_config.go
index 08dbe5fa947..a0daacb0bb1 100644
--- a/server/schedulers/hot_region_config.go
+++ b/server/schedulers/hot_region_config.go
@@ -242,8 +242,8 @@ func (conf *hotRegionSchedulerConfig) GetEnableForTiFlash() bool {
 }
 
 func (conf *hotRegionSchedulerConfig) SetEnableForTiFlash(enable bool) {
-	conf.RLock()
-	defer conf.RUnlock()
+	conf.Lock()
+	defer conf.Unlock()
 	conf.EnableForTiFlash = enable
 }
 
@@ -309,7 +309,7 @@ func (conf *hotRegionSchedulerConfig) handleSetConfig(w http.ResponseWriter, r *
 	}
 	newc, _ := json.Marshal(conf)
 	if !bytes.Equal(oldc, newc) {
-		conf.persist()
+		conf.persistLocked()
 		rd.Text(w, http.StatusOK, "success")
 	}
 
@@ -333,7 +333,7 @@ func (conf *hotRegionSchedulerConfig) handleSetConfig(w http.ResponseWriter, r *
 	rd.Text(w, http.StatusBadRequest, "config item not found")
 }
 
-func (conf *hotRegionSchedulerConfig) persist() error {
+func (conf *hotRegionSchedulerConfig) persistLocked() error {
 	data, err := schedule.EncodeConfig(conf)
 	if err != nil {
 		return err
@@ -343,6 +343,8 @@ func (conf *hotRegionSchedulerConfig) persist() error {
 
 func (conf *hotRegionSchedulerConfig) checkQuerySupport(cluster opt.Cluster) bool {
 	querySupport := cluster.IsFeatureSupported(versioninfo.HotScheduleWithQuery)
+	conf.Lock()
+	defer conf.Unlock()
 	if querySupport != conf.lastQuerySupported {
 		log.Info("query supported changed",
 			zap.Bool("last-query-support", conf.lastQuerySupported),

From 6f3ff3f8f0d2e57e15ff5869acfa343ae23a6d9a Mon Sep 17 00:00:00 2001
From: Ti Chi Robot <ti-community-prow-bot@tidb.io>
Date: Wed, 13 Oct 2021 14:33:27 +0800
Subject: [PATCH 5/6] scheduler: allow empty region to be scheduled and use a
 sperate tolerance config in scatter range scheduler (#4106) (#4118)

* scheduler: allow empty region to be scheduled and use a sperate tolerance config in scatter range scheduler

Signed-off-by: lhy1024 <admin@liudos.us>

* fix lint

Signed-off-by: lhy1024 <admin@liudos.us>

* address comments

Signed-off-by: lhy1024 <admin@liudos.us>

* fix lint

Signed-off-by: lhy1024 <admin@liudos.us>

Co-authored-by: lhy1024 <admin@liudos.us>
---
 server/schedulers/balance_region.go | 19 +++++++++++++++----
 server/schedulers/balance_test.go   | 27 +++++++++++++--------------
 server/schedulers/utils.go          | 10 +++++++++-
 3 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/server/schedulers/balance_region.go b/server/schedulers/balance_region.go
index 27e039f8b21..f1ad5b38b56 100644
--- a/server/schedulers/balance_region.go
+++ b/server/schedulers/balance_region.go
@@ -149,23 +149,34 @@ func (s *balanceRegionScheduler) Schedule(cluster opt.Cluster) []*operator.Opera
 		return stores[i].RegionScore(opts.GetRegionScoreFormulaVersion(), opts.GetHighSpaceRatio(), opts.GetLowSpaceRatio(), iOp) >
 			stores[j].RegionScore(opts.GetRegionScoreFormulaVersion(), opts.GetHighSpaceRatio(), opts.GetLowSpaceRatio(), jOp)
 	})
+
+	var allowBalanceEmptyRegion func(*core.RegionInfo) bool
+
+	switch cluster.(type) {
+	case *schedule.RangeCluster:
+		// allow empty region to be scheduled in range cluster
+		allowBalanceEmptyRegion = func(region *core.RegionInfo) bool { return true }
+	default:
+		allowBalanceEmptyRegion = opt.AllowBalanceEmptyRegion(cluster)
+	}
+
 	for _, plan.source = range stores {
 		for i := 0; i < balanceRegionRetryLimit; i++ {
 			schedulerCounter.WithLabelValues(s.GetName(), "total").Inc()
 			// Priority pick the region that has a pending peer.
 			// Pending region may means the disk is overload, remove the pending region firstly.
-			plan.region = cluster.RandPendingRegion(plan.SourceStoreID(), s.conf.Ranges, opt.HealthAllowPending(cluster), opt.ReplicatedRegion(cluster), opt.AllowBalanceEmptyRegion(cluster))
+			plan.region = cluster.RandPendingRegion(plan.SourceStoreID(), s.conf.Ranges, opt.HealthAllowPending(cluster), opt.ReplicatedRegion(cluster), allowBalanceEmptyRegion)
 			if plan.region == nil {
 				// Then pick the region that has a follower in the source store.
-				plan.region = cluster.RandFollowerRegion(plan.SourceStoreID(), s.conf.Ranges, opt.HealthRegion(cluster), opt.ReplicatedRegion(cluster), opt.AllowBalanceEmptyRegion(cluster))
+				plan.region = cluster.RandFollowerRegion(plan.SourceStoreID(), s.conf.Ranges, opt.HealthRegion(cluster), opt.ReplicatedRegion(cluster), allowBalanceEmptyRegion)
 			}
 			if plan.region == nil {
 				// Then pick the region has the leader in the source store.
-				plan.region = cluster.RandLeaderRegion(plan.SourceStoreID(), s.conf.Ranges, opt.HealthRegion(cluster), opt.ReplicatedRegion(cluster), opt.AllowBalanceEmptyRegion(cluster))
+				plan.region = cluster.RandLeaderRegion(plan.SourceStoreID(), s.conf.Ranges, opt.HealthRegion(cluster), opt.ReplicatedRegion(cluster), allowBalanceEmptyRegion)
 			}
 			if plan.region == nil {
 				// Finally pick learner.
-				plan.region = cluster.RandLearnerRegion(plan.SourceStoreID(), s.conf.Ranges, opt.HealthRegion(cluster), opt.ReplicatedRegion(cluster), opt.AllowBalanceEmptyRegion(cluster))
+				plan.region = cluster.RandLearnerRegion(plan.SourceStoreID(), s.conf.Ranges, opt.HealthRegion(cluster), opt.ReplicatedRegion(cluster), allowBalanceEmptyRegion)
 			}
 			if plan.region == nil {
 				schedulerCounter.WithLabelValues(s.GetName(), "no-region").Inc()
diff --git a/server/schedulers/balance_test.go b/server/schedulers/balance_test.go
index 62f1b93ccc9..3dcb5153ddb 100644
--- a/server/schedulers/balance_test.go
+++ b/server/schedulers/balance_test.go
@@ -1053,28 +1053,29 @@ func (s *testRandomMergeSchedulerSuite) TestMerge(c *C) {
 	c.Assert(mb.IsScheduleAllowed(tc), IsFalse)
 }
 
-var _ = Suite(&testScatterRangeLeaderSuite{})
+var _ = Suite(&testScatterRangeSuite{})
 
-type testScatterRangeLeaderSuite struct {
+type testScatterRangeSuite struct {
 	ctx    context.Context
 	cancel context.CancelFunc
 }
 
-func (s *testScatterRangeLeaderSuite) SetUpSuite(c *C) {
+func (s *testScatterRangeSuite) SetUpSuite(c *C) {
 	s.ctx, s.cancel = context.WithCancel(context.Background())
 }
 
-func (s *testScatterRangeLeaderSuite) TearDownSuite(c *C) {
+func (s *testScatterRangeSuite) TearDownSuite(c *C) {
 	s.cancel()
 }
 
-func (s *testScatterRangeLeaderSuite) TestBalance(c *C) {
+func (s *testScatterRangeSuite) TestBalance(c *C) {
 	opt := config.NewTestOptions()
 	// TODO: enable palcementrules
 	opt.SetPlacementRuleEnabled(false)
 	tc := mockcluster.NewCluster(s.ctx, opt)
 	tc.DisableFeature(versioninfo.JointConsensus)
-	tc.SetTolerantSizeRatio(2.5)
+	// range cluster use a special tolerant ratio, cluster opt take no impact
+	tc.SetTolerantSizeRatio(10000)
 	// Add stores 1,2,3,4,5.
 	tc.AddRegionStore(1, 0)
 	tc.AddRegionStore(2, 0)
@@ -1099,17 +1100,16 @@ func (s *testScatterRangeLeaderSuite) TestBalance(c *C) {
 		})
 		id += 4
 	}
-	// empty case
+	// empty region case
 	regions[49].EndKey = []byte("")
 	for _, meta := range regions {
 		leader := rand.Intn(4) % 3
 		regionInfo := core.NewRegionInfo(
 			meta,
 			meta.Peers[leader],
-			core.SetApproximateKeys(96),
-			core.SetApproximateSize(96),
+			core.SetApproximateKeys(1),
+			core.SetApproximateSize(1),
 		)
-
 		tc.Regions.SetRegion(regionInfo)
 	}
 	for i := 0; i < 100; i++ {
@@ -1133,7 +1133,7 @@ func (s *testScatterRangeLeaderSuite) TestBalance(c *C) {
 	}
 }
 
-func (s *testScatterRangeLeaderSuite) TestBalanceLeaderLimit(c *C) {
+func (s *testScatterRangeSuite) TestBalanceLeaderLimit(c *C) {
 	opt := config.NewTestOptions()
 	opt.SetPlacementRuleEnabled(false)
 	tc := mockcluster.NewCluster(s.ctx, opt)
@@ -1164,7 +1164,6 @@ func (s *testScatterRangeLeaderSuite) TestBalanceLeaderLimit(c *C) {
 		id += 4
 	}
 
-	// empty case
 	regions[49].EndKey = []byte("")
 	for _, meta := range regions {
 		leader := rand.Intn(4) % 3
@@ -1209,7 +1208,7 @@ func (s *testScatterRangeLeaderSuite) TestBalanceLeaderLimit(c *C) {
 	c.Check(maxLeaderCount-minLeaderCount, Greater, 10)
 }
 
-func (s *testScatterRangeLeaderSuite) TestConcurrencyUpdateConfig(c *C) {
+func (s *testScatterRangeSuite) TestConcurrencyUpdateConfig(c *C) {
 	opt := config.NewTestOptions()
 	tc := mockcluster.NewCluster(s.ctx, opt)
 	oc := schedule.NewOperatorController(s.ctx, nil, nil)
@@ -1235,7 +1234,7 @@ func (s *testScatterRangeLeaderSuite) TestConcurrencyUpdateConfig(c *C) {
 	ch <- struct{}{}
 }
 
-func (s *testScatterRangeLeaderSuite) TestBalanceWhenRegionNotHeartbeat(c *C) {
+func (s *testScatterRangeSuite) TestBalanceWhenRegionNotHeartbeat(c *C) {
 	opt := config.NewTestOptions()
 	tc := mockcluster.NewCluster(s.ctx, opt)
 	// Add stores 1,2,3.
diff --git a/server/schedulers/utils.go b/server/schedulers/utils.go
index 65e80bb45ca..458d733658b 100644
--- a/server/schedulers/utils.go
+++ b/server/schedulers/utils.go
@@ -25,6 +25,7 @@ import (
 	"github.com/tikv/pd/pkg/errs"
 	"github.com/tikv/pd/pkg/typeutil"
 	"github.com/tikv/pd/server/core"
+	"github.com/tikv/pd/server/schedule"
 	"github.com/tikv/pd/server/schedule/operator"
 	"github.com/tikv/pd/server/schedule/opt"
 	"github.com/tikv/pd/server/statistics"
@@ -148,7 +149,14 @@ func (p *balancePlan) getTolerantResource() int64 {
 }
 
 func adjustTolerantRatio(cluster opt.Cluster, kind core.ScheduleKind) float64 {
-	tolerantSizeRatio := cluster.GetOpts().GetTolerantSizeRatio()
+	var tolerantSizeRatio float64
+	switch c := cluster.(type) {
+	case *schedule.RangeCluster:
+		// range cluster use a separate configuration
+		tolerantSizeRatio = c.GetTolerantSizeRatio()
+	default:
+		tolerantSizeRatio = cluster.GetOpts().GetTolerantSizeRatio()
+	}
 	if kind.Resource == core.LeaderKind && kind.Policy == core.ByCount {
 		if tolerantSizeRatio == 0 {
 			return leaderTolerantSizeRatio

From 46534a1efe0acd4caeeec74a35cf514a3a71f5b1 Mon Sep 17 00:00:00 2001
From: Ti Chi Robot <ti-community-prow-bot@tidb.io>
Date: Wed, 13 Oct 2021 14:45:27 +0800
Subject: [PATCH 6/6] scheduler: dynamically adjust the retry limit according
 to the operator (#4007) (#4046)

* scheduler: dynamically adjust the retry limit according to the operator

Signed-off-by: HunDunDM <hundundm@gmail.com>

* address comment

Signed-off-by: HunDunDM <hundundm@gmail.com>

* fix license

Signed-off-by: HunDunDM <hundundm@gmail.com>

Co-authored-by: HunDunDM <hundundm@gmail.com>
---
 server/schedulers/balance_leader.go | 16 ++++++--
 server/schedulers/balance_region.go |  8 +++-
 server/schedulers/utils.go          | 60 +++++++++++++++++++++++++++--
 server/schedulers/utils_test.go     | 53 +++++++++++++++++++++++++
 4 files changed, 128 insertions(+), 9 deletions(-)
 create mode 100644 server/schedulers/utils_test.go

diff --git a/server/schedulers/balance_leader.go b/server/schedulers/balance_leader.go
index 6b80ecf6fb9..7efc77d8779 100644
--- a/server/schedulers/balance_leader.go
+++ b/server/schedulers/balance_leader.go
@@ -70,6 +70,7 @@ type balanceLeaderSchedulerConfig struct {
 
 type balanceLeaderScheduler struct {
 	*BaseScheduler
+	*retryQuota
 	conf         *balanceLeaderSchedulerConfig
 	opController *schedule.OperatorController
 	filters      []filter.Filter
@@ -83,6 +84,7 @@ func newBalanceLeaderScheduler(opController *schedule.OperatorController, conf *
 
 	s := &balanceLeaderScheduler{
 		BaseScheduler: base,
+		retryQuota:    newRetryQuota(balanceLeaderRetryLimit, defaultMinRetryLimit, defaultRetryQuotaAttenuation),
 		conf:          conf,
 		opController:  opController,
 		counter:       balanceLeaderCounter,
@@ -153,7 +155,7 @@ func (l *balanceLeaderScheduler) Schedule(cluster opt.Cluster) []*operator.Opera
 	})
 	sort.Slice(targets, func(i, j int) bool {
 		iOp := plan.GetOpInfluence(targets[i].GetID())
-		jOp := plan.GetOpInfluence(targets[i].GetID())
+		jOp := plan.GetOpInfluence(targets[j].GetID())
 		return targets[i].LeaderScore(leaderSchedulePolicy, iOp) <
 			targets[j].LeaderScore(leaderSchedulePolicy, jOp)
 	})
@@ -161,32 +163,38 @@ func (l *balanceLeaderScheduler) Schedule(cluster opt.Cluster) []*operator.Opera
 	for i := 0; i < len(sources) || i < len(targets); i++ {
 		if i < len(sources) {
 			plan.source, plan.target = sources[i], nil
+			retryLimit := l.retryQuota.GetLimit(plan.source)
 			log.Debug("store leader score", zap.String("scheduler", l.GetName()), zap.Uint64("source-store", plan.SourceStoreID()))
 			l.counter.WithLabelValues("high-score", plan.SourceMetricLabel()).Inc()
-			for j := 0; j < balanceLeaderRetryLimit; j++ {
+			for j := 0; j < retryLimit; j++ {
 				schedulerCounter.WithLabelValues(l.GetName(), "total").Inc()
 				if ops := l.transferLeaderOut(plan); len(ops) > 0 {
+					l.retryQuota.ResetLimit(plan.source)
 					ops[0].Counters = append(ops[0].Counters, l.counter.WithLabelValues("transfer-out", plan.SourceMetricLabel()))
 					return ops
 				}
 			}
+			l.Attenuate(plan.source)
 			log.Debug("no operator created for selected stores", zap.String("scheduler", l.GetName()), zap.Uint64("source", plan.SourceStoreID()))
 		}
 		if i < len(targets) {
 			plan.source, plan.target = nil, targets[i]
+			retryLimit := l.retryQuota.GetLimit(plan.target)
 			log.Debug("store leader score", zap.String("scheduler", l.GetName()), zap.Uint64("target-store", plan.TargetStoreID()))
 			l.counter.WithLabelValues("low-score", plan.TargetMetricLabel()).Inc()
-
-			for j := 0; j < balanceLeaderRetryLimit; j++ {
+			for j := 0; j < retryLimit; j++ {
 				schedulerCounter.WithLabelValues(l.GetName(), "total").Inc()
 				if ops := l.transferLeaderIn(plan); len(ops) > 0 {
+					l.retryQuota.ResetLimit(plan.target)
 					ops[0].Counters = append(ops[0].Counters, l.counter.WithLabelValues("transfer-in", plan.TargetMetricLabel()))
 					return ops
 				}
 			}
+			l.Attenuate(plan.target)
 			log.Debug("no operator created for selected stores", zap.String("scheduler", l.GetName()), zap.Uint64("target", plan.TargetStoreID()))
 		}
 	}
+	l.retryQuota.GC(append(sources, targets...))
 	return nil
 }
 
diff --git a/server/schedulers/balance_region.go b/server/schedulers/balance_region.go
index f1ad5b38b56..43e23ef52b4 100644
--- a/server/schedulers/balance_region.go
+++ b/server/schedulers/balance_region.go
@@ -70,6 +70,7 @@ type balanceRegionSchedulerConfig struct {
 
 type balanceRegionScheduler struct {
 	*BaseScheduler
+	*retryQuota
 	conf         *balanceRegionSchedulerConfig
 	opController *schedule.OperatorController
 	filters      []filter.Filter
@@ -82,6 +83,7 @@ func newBalanceRegionScheduler(opController *schedule.OperatorController, conf *
 	base := NewBaseScheduler(opController)
 	scheduler := &balanceRegionScheduler{
 		BaseScheduler: base,
+		retryQuota:    newRetryQuota(balanceRegionRetryLimit, defaultMinRetryLimit, defaultRetryQuotaAttenuation),
 		conf:          conf,
 		opController:  opController,
 		counter:       balanceRegionCounter,
@@ -161,7 +163,8 @@ func (s *balanceRegionScheduler) Schedule(cluster opt.Cluster) []*operator.Opera
 	}
 
 	for _, plan.source = range stores {
-		for i := 0; i < balanceRegionRetryLimit; i++ {
+		retryLimit := s.retryQuota.GetLimit(plan.source)
+		for i := 0; i < retryLimit; i++ {
 			schedulerCounter.WithLabelValues(s.GetName(), "total").Inc()
 			// Priority pick the region that has a pending peer.
 			// Pending region may means the disk is overload, remove the pending region firstly.
@@ -198,11 +201,14 @@ func (s *balanceRegionScheduler) Schedule(cluster opt.Cluster) []*operator.Opera
 			}
 
 			if op := s.transferPeer(plan); op != nil {
+				s.retryQuota.ResetLimit(plan.source)
 				op.Counters = append(op.Counters, schedulerCounter.WithLabelValues(s.GetName(), "new-operator"))
 				return []*operator.Operator{op}
 			}
 		}
+		s.retryQuota.Attenuate(plan.source)
 	}
+	s.retryQuota.GC(stores)
 	return nil
 }
 
diff --git a/server/schedulers/utils.go b/server/schedulers/utils.go
index 458d733658b..688ef0a52a2 100644
--- a/server/schedulers/utils.go
+++ b/server/schedulers/utils.go
@@ -34,10 +34,12 @@ import (
 
 const (
 	// adjustRatio is used to adjust TolerantSizeRatio according to region count.
-	adjustRatio             float64 = 0.005
-	leaderTolerantSizeRatio float64 = 5.0
-	minTolerantSizeRatio    float64 = 1.0
-	influenceAmp            int64   = 100
+	adjustRatio                  float64 = 0.005
+	leaderTolerantSizeRatio      float64 = 5.0
+	minTolerantSizeRatio         float64 = 1.0
+	influenceAmp                 int64   = 100
+	defaultMinRetryLimit                 = 1
+	defaultRetryQuotaAttenuation         = 2
 )
 
 type balancePlan struct {
@@ -753,3 +755,53 @@ func filterHotPeers(kind core.ResourceKind, peers []*statistics.HotPeerStat) []*
 	}
 	return ret
 }
+
+type retryQuota struct {
+	initialLimit int
+	minLimit     int
+	attenuation  int
+
+	limits map[uint64]int
+}
+
+func newRetryQuota(initialLimit, minLimit, attenuation int) *retryQuota {
+	return &retryQuota{
+		initialLimit: initialLimit,
+		minLimit:     minLimit,
+		attenuation:  attenuation,
+		limits:       make(map[uint64]int),
+	}
+}
+
+func (q *retryQuota) GetLimit(store *core.StoreInfo) int {
+	id := store.GetID()
+	if limit, ok := q.limits[id]; ok {
+		return limit
+	}
+	q.limits[id] = q.initialLimit
+	return q.initialLimit
+}
+
+func (q *retryQuota) ResetLimit(store *core.StoreInfo) {
+	q.limits[store.GetID()] = q.initialLimit
+}
+
+func (q *retryQuota) Attenuate(store *core.StoreInfo) {
+	newLimit := q.GetLimit(store) / q.attenuation
+	if newLimit < q.minLimit {
+		newLimit = q.minLimit
+	}
+	q.limits[store.GetID()] = newLimit
+}
+
+func (q *retryQuota) GC(keepStores []*core.StoreInfo) {
+	set := make(map[uint64]struct{}, len(keepStores))
+	for _, store := range keepStores {
+		set[store.GetID()] = struct{}{}
+	}
+	for id := range q.limits {
+		if _, ok := set[id]; !ok {
+			delete(q.limits, id)
+		}
+	}
+}
diff --git a/server/schedulers/utils_test.go b/server/schedulers/utils_test.go
new file mode 100644
index 00000000000..13620ebd114
--- /dev/null
+++ b/server/schedulers/utils_test.go
@@ -0,0 +1,53 @@
+// Copyright 2021 TiKV Project Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package schedulers
+
+import (
+	. "github.com/pingcap/check"
+	"github.com/pingcap/kvproto/pkg/metapb"
+	"github.com/tikv/pd/server/core"
+)
+
+var _ = Suite(&testUtilsSuite{})
+
+type testUtilsSuite struct{}
+
+func (s *testUtilsSuite) TestRetryQuota(c *C) {
+	q := newRetryQuota(10, 1, 2)
+	store1 := core.NewStoreInfo(&metapb.Store{Id: 1})
+	store2 := core.NewStoreInfo(&metapb.Store{Id: 2})
+	keepStores := []*core.StoreInfo{store1}
+
+	// test GetLimit
+	c.Assert(q.GetLimit(store1), Equals, 10)
+
+	// test Attenuate
+	for _, expected := range []int{5, 2, 1, 1, 1} {
+		q.Attenuate(store1)
+		c.Assert(q.GetLimit(store1), Equals, expected)
+	}
+
+	// test GC
+	c.Assert(q.GetLimit(store2), Equals, 10)
+	q.Attenuate(store2)
+	c.Assert(q.GetLimit(store2), Equals, 5)
+	q.GC(keepStores)
+	c.Assert(q.GetLimit(store1), Equals, 1)
+	c.Assert(q.GetLimit(store2), Equals, 10)
+
+	// test ResetLimit
+	q.ResetLimit(store1)
+	c.Assert(q.GetLimit(store1), Equals, 10)
+}