From 31aea9bfd1e15c769dabed5614622bd4eb8792ed Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 14 Sep 2021 00:46:41 +0800 Subject: [PATCH] placement: do not delete orphan peers if some peers selected by RuleFit is down or pending (#4067) (#4074) * This is an automated cherry-pick of #4067 Signed-off-by: ti-chi-bot * fix Signed-off-by: HunDunDM Co-authored-by: Andy Lok Co-authored-by: HunDunDM --- server/schedule/checker/rule_checker.go | 17 +++++++- server/schedule/checker/rule_checker_test.go | 42 ++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/server/schedule/checker/rule_checker.go b/server/schedule/checker/rule_checker.go index 6ce4e8de449..9e23d48b809 100644 --- a/server/schedule/checker/rule_checker.go +++ b/server/schedule/checker/rule_checker.go @@ -269,12 +269,27 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg if len(fit.OrphanPeers) == 0 { return nil, nil } - // remove orphan peers only when all rules are satisfied (count+role) + // remove orphan peers only when all rules are satisfied (count+role) and all peers selected + // by RuleFits is not pending or down. for _, rf := range fit.RuleFits { if !rf.IsSatisfied() { checkerCounter.WithLabelValues("rule_checker", "skip-remove-orphan-peer").Inc() return nil, nil } + for _, p := range rf.Peers { + for _, pendingPeer := range region.GetPendingPeers() { + if pendingPeer.Id == p.Id { + checkerCounter.WithLabelValues("rule_checker", "skip-remove-orphan-peer").Inc() + return nil, nil + } + } + for _, downPeer := range region.GetDownPeers() { + if downPeer.Peer.Id == p.Id { + checkerCounter.WithLabelValues("rule_checker", "skip-remove-orphan-peer").Inc() + return nil, nil + } + } + } } checkerCounter.WithLabelValues("rule_checker", "remove-orphan-peer").Inc() peer := fit.OrphanPeers[0] diff --git a/server/schedule/checker/rule_checker_test.go b/server/schedule/checker/rule_checker_test.go index 9893508fb76..a433bb71b35 100644 --- a/server/schedule/checker/rule_checker_test.go +++ b/server/schedule/checker/rule_checker_test.go @@ -600,3 +600,45 @@ func (s *testRuleCheckerSuite) TestFixOfflinePeer(c *C) { s.ruleManager.SetRule(rule) c.Assert(s.rc.Check(region), IsNil) } + +// Ref https://github.com/tikv/pd/issues/4045 +func (s *testRuleCheckerSuite) TestSkipFixOrphanPeerIfSelectedPeerisPendingOrDown(c *C) { + s.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"}) + s.cluster.AddLabelsStore(2, 1, map[string]string{"host": "host1"}) + s.cluster.AddLabelsStore(3, 1, map[string]string{"host": "host2"}) + s.cluster.AddLabelsStore(4, 1, map[string]string{"host": "host4"}) + s.cluster.AddLeaderRegionWithRange(1, "", "", 1, 2, 3, 4) + + // set peer3 and peer4 to pending + r1 := s.cluster.GetRegion(1) + r1 = r1.Clone(core.WithPendingPeers([]*metapb.Peer{r1.GetStorePeer(3), r1.GetStorePeer(4)})) + s.cluster.PutRegion(r1) + + // should not remove extra peer + op := s.rc.Check(s.cluster.GetRegion(1)) + c.Assert(op, IsNil) + + // set peer3 to down-peer + r1 = r1.Clone(core.WithPendingPeers([]*metapb.Peer{r1.GetStorePeer(4)})) + r1 = r1.Clone(core.WithDownPeers([]*pdpb.PeerStats{ + { + Peer: r1.GetStorePeer(3), + DownSeconds: 42, + }, + })) + s.cluster.PutRegion(r1) + + // should not remove extra peer + op = s.rc.Check(s.cluster.GetRegion(1)) + c.Assert(op, IsNil) + + // set peer3 to normal + r1 = r1.Clone(core.WithDownPeers(nil)) + s.cluster.PutRegion(r1) + + // should remove extra peer now + var remove operator.RemovePeer + op = s.rc.Check(s.cluster.GetRegion(1)) + c.Assert(op.Step(0), FitsTypeOf, remove) + c.Assert(op.Desc(), Equals, "remove-orphan-peer") +}