From a9633ae30f41754293d6af008ce174a5b3da89e1 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Fri, 3 Nov 2023 10:09:39 +0800 Subject: [PATCH 1/2] checker: replace down check with disconnect check when fixing orphan peer (#7294) close tikv/pd#7249 Signed-off-by: lhy1024 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> Signed-off-by: lhy1024 --- server/core/store.go | 3 + server/schedule/checker/rule_checker.go | 60 ++++--- server/schedule/checker/rule_checker_test.go | 176 ++++++++++++++++++- 3 files changed, 214 insertions(+), 25 deletions(-) diff --git a/server/core/store.go b/server/core/store.go index 1556d6d0955..247b1e828b3 100644 --- a/server/core/store.go +++ b/server/core/store.go @@ -496,6 +496,9 @@ var ( // tikv's store heartbeat for a short time, maybe caused by process restart or // temporary network failure. func (s *StoreInfo) IsDisconnected() bool { + if s == nil { + return true + } return s.DownTime() > storeDisconnectDuration } diff --git a/server/schedule/checker/rule_checker.go b/server/schedule/checker/rule_checker.go index 1898012c19d..da707488ea3 100644 --- a/server/schedule/checker/rule_checker.go +++ b/server/schedule/checker/rule_checker.go @@ -390,7 +390,7 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg if len(fit.OrphanPeers) == 0 { return nil, nil } - var pinDownPeer *metapb.Peer + isUnhealthyPeer := func(id uint64) bool { for _, downPeer := range region.GetDownPeers() { if downPeer.Peer.GetId() == id { @@ -404,31 +404,41 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg } return false } + + isDisconnectedPeer := func(p *metapb.Peer) bool { + // avoid to meet down store when fix orphan peers, + // Isdisconnected is more strictly than IsUnhealthy. + return c.cluster.GetStore(p.GetStoreId()).IsDisconnected() + } + + checkDownPeer := func(peers []*metapb.Peer) (*metapb.Peer, bool) { + for _, p := range peers { + if isUnhealthyPeer(p.GetId()) { + // make sure is down peer. + if region.GetDownPeer(p.GetId()) != nil { + return p, true + } + return nil, true + } + if isDisconnectedPeer(p) { + return p, true + } + } + return nil, false + } + // remove orphan peers only when all rules are satisfied (count+role) and all peers selected // by RuleFits is not pending or down. + var pinDownPeer *metapb.Peer hasUnhealthyFit := false -loopFits: for _, rf := range fit.RuleFits { if !rf.IsSatisfied() { hasUnhealthyFit = true break } - for _, p := range rf.Peers { - if isUnhealthyPeer(p.GetId()) { - // make sure is down peer. - if region.GetDownPeer(p.GetId()) != nil { - pinDownPeer = p - } - hasUnhealthyFit = true - break loopFits - } - // avoid to meet down store when fix orpahn peers, - // Isdisconnected is more strictly than IsUnhealthy. - if c.cluster.GetStore(p.GetStoreId()).IsDisconnected() { - hasUnhealthyFit = true - pinDownPeer = p - break loopFits - } + pinDownPeer, hasUnhealthyFit = checkDownPeer(rf.Peers) + if hasUnhealthyFit { + break } } @@ -445,15 +455,15 @@ loopFits: continue } // make sure the orphan peer is healthy. - if isUnhealthyPeer(orphanPeer.GetId()) { + if isUnhealthyPeer(orphanPeer.GetId()) || isDisconnectedPeer(orphanPeer) { continue } // no consider witness in this path. if pinDownPeer.GetIsWitness() || orphanPeer.GetIsWitness() { continue } - // down peer's store should be down. 
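The change above swaps the checker's old "store is down" gate for the earlier-firing "store is disconnected" check, and in this first patch it also makes IsDisconnected tolerate a nil StoreInfo. The sketch below shows that distinction in isolation; the store type and both thresholds are placeholders chosen for illustration, not PD's actual constants.

package main

import (
	"fmt"
	"time"
)

// store is a simplified stand-in for PD's core.StoreInfo.
type store struct {
	lastHeartbeat time.Time
}

const (
	disconnectAfter = 20 * time.Second // placeholder disconnect threshold
	downAfter       = 30 * time.Minute // placeholder max-store-down-time
)

func (s *store) downTime() time.Duration { return time.Since(s.lastHeartbeat) }

// isDisconnected mirrors the shape of the check this patch relies on: a nil
// (unknown) store is treated as disconnected instead of being dereferenced.
func (s *store) isDisconnected() bool {
	if s == nil {
		return true
	}
	return s.downTime() > disconnectAfter
}

func (s *store) isDown() bool { return s != nil && s.downTime() > downAfter }

func main() {
	st := &store{lastHeartbeat: time.Now().Add(-5 * time.Minute)}
	fmt.Println(st.isDisconnected(), st.isDown()) // true false: the disconnect state fires long before the down state
	var unknown *store
	fmt.Println(unknown.isDisconnected()) // true: a missing store counts as disconnected
}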
- if !c.isStoreDownTimeHitMaxDownTime(pinDownPeer.GetStoreId()) { + // down peer's store should be disconnected + if !isDisconnectedPeer(pinDownPeer) { continue } // check if down peer can replace with orphan peer. @@ -468,7 +478,7 @@ loopFits: case orphanPeerRole == metapb.PeerRole_Voter && destRole == metapb.PeerRole_Learner: return operator.CreateDemoteLearnerOperatorAndRemovePeer("replace-down-peer-with-orphan-peer", c.cluster, region, orphanPeer, pinDownPeer) case orphanPeerRole == metapb.PeerRole_Voter && destRole == metapb.PeerRole_Voter && - c.cluster.GetStore(pinDownPeer.GetStoreId()).IsDisconnected() && !dstStore.IsDisconnected(): + isDisconnectedPeer(pinDownPeer) && !dstStore.IsDisconnected(): return operator.CreateRemovePeerOperator("remove-replaced-orphan-peer", c.cluster, 0, region, pinDownPeer.GetStoreId()) default: // destRole should not same with orphanPeerRole. if role is same, it fit with orphanPeer should be better than now. @@ -485,7 +495,11 @@ loopFits: for _, orphanPeer := range fit.OrphanPeers { if isUnhealthyPeer(orphanPeer.GetId()) { checkerCounter.WithLabelValues("rule_checker", "remove-orphan-peer").Inc() - return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId) + return operator.CreateRemovePeerOperator("remove-unhealthy-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId) + } + if isDisconnectedPeer(orphanPeer) { + checkerCounter.WithLabelValues("rule_checker", "remove-orphan-peer").Inc() + return operator.CreateRemovePeerOperator("remove-disconnected-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId) } if hasHealthPeer { // there already exists a healthy orphan peer, so we can remove other orphan Peers. diff --git a/server/schedule/checker/rule_checker_test.go b/server/schedule/checker/rule_checker_test.go index 4230cef537a..58869ac823c 100644 --- a/server/schedule/checker/rule_checker_test.go +++ b/server/schedule/checker/rule_checker_test.go @@ -196,7 +196,7 @@ func (suite *ruleCheckerTestSuite) TestFixToManyOrphanPeers() { suite.cluster.PutRegion(region) op = suite.rc.Check(suite.cluster.GetRegion(1)) suite.NotNil(op) - suite.Equal("remove-orphan-peer", op.Desc()) + suite.Equal("remove-unhealthy-orphan-peer", op.Desc()) suite.Equal(uint64(4), op.Step(0).(operator.RemovePeer).FromStore) } @@ -663,7 +663,7 @@ func (suite *ruleCheckerTestSuite) TestPriorityFixOrphanPeer() { suite.cluster.PutRegion(testRegion) op = suite.rc.Check(suite.cluster.GetRegion(1)) suite.NotNil(op) - suite.Equal("remove-orphan-peer", op.Desc()) + suite.Equal("remove-unhealthy-orphan-peer", op.Desc()) suite.IsType(remove, op.Step(0)) // Ref #3521 suite.cluster.SetStoreOffline(2) @@ -684,6 +684,178 @@ func (suite *ruleCheckerTestSuite) TestPriorityFixOrphanPeer() { suite.Equal("remove-orphan-peer", op.Desc()) } +// Ref https://github.com/tikv/pd/issues/7249 https://github.com/tikv/tikv/issues/15799 +func (suite *ruleCheckerTestSuite) TestFixOrphanPeerWithDisconnectedStoreAndRuleChanged() { + // init cluster with 5 replicas + suite.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"}) + suite.cluster.AddLabelsStore(2, 1, map[string]string{"host": "host2"}) + suite.cluster.AddLabelsStore(3, 1, map[string]string{"host": "host3"}) + suite.cluster.AddLabelsStore(4, 1, map[string]string{"host": "host4"}) + suite.cluster.AddLabelsStore(5, 1, map[string]string{"host": "host5"}) + storeIDs := []uint64{1, 2, 3, 4, 5} + suite.cluster.AddLeaderRegionWithRange(1, "", "", storeIDs[0], storeIDs[1:]...) 
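The switch above decides how an orphan peer may stand in for the pinned down or disconnected peer, based purely on the two roles. The function below condenses that decision to its inputs and outputs; it assumes the kvproto metapb package the checker already imports, elides the operator constructors and counters, and returns plain strings only as labels for the operators the real code builds.

// chooseReplacement is an illustrative reduction of the switch in
// fixOrphanPeers, not code from the patch.
func chooseReplacement(orphanRole, destRole metapb.PeerRole, pinnedDisconnected, orphanStoreDisconnected bool) string {
	switch {
	case orphanRole == metapb.PeerRole_Learner && destRole == metapb.PeerRole_Voter:
		return "promote the orphan learner and remove the pinned peer"
	case orphanRole == metapb.PeerRole_Voter && destRole == metapb.PeerRole_Learner:
		return "demote the orphan voter and remove the pinned peer"
	case orphanRole == metapb.PeerRole_Voter && destRole == metapb.PeerRole_Voter &&
		pinnedDisconnected && !orphanStoreDisconnected:
		return "remove the pinned peer and keep the orphan in its place"
	default:
		// same-role cases where the orphan's store is no healthier fall
		// through to the generic orphan-peer handling.
		return "no replacement"
	}
}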
+ rule := &placement.Rule{ + GroupID: "pd", + ID: "default", + Role: placement.Voter, + Count: 5, + StartKey: []byte{}, + EndKey: []byte{}, + } + suite.ruleManager.SetRule(rule) + op := suite.rc.Check(suite.cluster.GetRegion(1)) + suite.Nil(op) + + // set store 1, 2 to disconnected + suite.cluster.SetStoreDisconnect(1) + suite.cluster.SetStoreDisconnect(2) + + // change rule to 3 replicas + rule = &placement.Rule{ + GroupID: "pd", + ID: "default", + Role: placement.Voter, + Count: 3, + StartKey: []byte{}, + EndKey: []byte{}, + Override: true, + } + suite.ruleManager.SetRule(rule) + + // remove store 1 from region 1 + op = suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("remove-replaced-orphan-peer", op.Desc()) + suite.Equal(op.Len(), 2) + newLeaderID := op.Step(0).(operator.TransferLeader).ToStore + removedPeerID := op.Step(1).(operator.RemovePeer).FromStore + r1 := suite.cluster.GetRegion(1) + r1 = r1.Clone( + core.WithLeader(r1.GetPeer(newLeaderID)), + core.WithRemoveStorePeer(removedPeerID)) + suite.cluster.PutRegion(r1) + r1 = suite.cluster.GetRegion(1) + suite.Len(r1.GetPeers(), 4) + + // remove store 2 from region 1 + op = suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("remove-replaced-orphan-peer", op.Desc()) + suite.Equal(op.Len(), 1) + removedPeerID = op.Step(0).(operator.RemovePeer).FromStore + r1 = r1.Clone(core.WithRemoveStorePeer(removedPeerID)) + suite.cluster.PutRegion(r1) + r1 = suite.cluster.GetRegion(1) + suite.Len(r1.GetPeers(), 3) + for _, p := range r1.GetPeers() { + suite.NotEqual(p.GetStoreId(), 1) + suite.NotEqual(p.GetStoreId(), 2) + } +} + +// Ref https://github.com/tikv/pd/issues/7249 https://github.com/tikv/tikv/issues/15799 +func (suite *ruleCheckerTestSuite) TestFixOrphanPeerWithDisconnectedStoreAndRuleChanged2() { + // init cluster with 5 voters and 1 learner + suite.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"}) + suite.cluster.AddLabelsStore(2, 1, map[string]string{"host": "host2"}) + suite.cluster.AddLabelsStore(3, 1, map[string]string{"host": "host3"}) + suite.cluster.AddLabelsStore(4, 1, map[string]string{"host": "host4"}) + suite.cluster.AddLabelsStore(5, 1, map[string]string{"host": "host5"}) + suite.cluster.AddLabelsStore(6, 1, map[string]string{"host": "host6"}) + storeIDs := []uint64{1, 2, 3, 4, 5} + suite.cluster.AddLeaderRegionWithRange(1, "", "", storeIDs[0], storeIDs[1:]...) 
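Because Check returns at most one operator per call, the test above applies the operator's steps by hand, clones the region accordingly, and writes it back before checking again. A hypothetical helper with that shape (not part of the patch) could factor out the apply-and-recheck loop using only calls that already appear in this suite:

// applyFirstRegionOperator applies the TransferLeader/RemovePeer steps of one
// checker operator to region 1 and stores the result, so the next Check sees
// the updated region. Hypothetical helper for illustration only.
func applyFirstRegionOperator(suite *ruleCheckerTestSuite) *operator.Operator {
	op := suite.rc.Check(suite.cluster.GetRegion(1))
	if op == nil {
		return nil
	}
	region := suite.cluster.GetRegion(1)
	for i := 0; i < op.Len(); i++ {
		switch step := op.Step(i).(type) {
		case operator.TransferLeader:
			region = region.Clone(core.WithLeader(region.GetStorePeer(step.ToStore)))
		case operator.RemovePeer:
			region = region.Clone(core.WithRemoveStorePeer(step.FromStore))
		}
	}
	suite.cluster.PutRegion(region)
	return op
}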
+ r1 := suite.cluster.GetRegion(1) + r1 = r1.Clone(core.WithAddPeer(&metapb.Peer{Id: 6, StoreId: 6, Role: metapb.PeerRole_Learner})) + suite.cluster.PutRegion(r1) + err := suite.ruleManager.SetRules([]*placement.Rule{ + { + GroupID: "pd", + ID: "default", + Index: 100, + Override: true, + Role: placement.Voter, + Count: 5, + IsWitness: false, + }, + { + GroupID: "pd", + ID: "r1", + Index: 100, + Override: false, + Role: placement.Learner, + Count: 1, + IsWitness: false, + }, + }) + suite.NoError(err) + + op := suite.rc.Check(suite.cluster.GetRegion(1)) + suite.Nil(op) + + // set store 1, 2 to disconnected + suite.cluster.SetStoreDisconnect(1) + suite.cluster.SetStoreDisconnect(2) + suite.cluster.SetStoreDisconnect(3) + + // change rule to 3 replicas + suite.ruleManager.DeleteRuleGroup("pd") + suite.ruleManager.SetRule(&placement.Rule{ + GroupID: "pd", + ID: "default", + Role: placement.Voter, + Count: 2, + StartKey: []byte{}, + EndKey: []byte{}, + Override: true, + }) + + // remove store 1 from region 1 + op = suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("remove-replaced-orphan-peer", op.Desc()) + suite.Equal(op.Len(), 2) + newLeaderID := op.Step(0).(operator.TransferLeader).ToStore + removedPeerID := op.Step(1).(operator.RemovePeer).FromStore + r1 = suite.cluster.GetRegion(1) + r1 = r1.Clone( + core.WithLeader(r1.GetPeer(newLeaderID)), + core.WithRemoveStorePeer(removedPeerID)) + suite.cluster.PutRegion(r1) + r1 = suite.cluster.GetRegion(1) + suite.Len(r1.GetPeers(), 5) + + // remove store 2 from region 1 + op = suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("remove-replaced-orphan-peer", op.Desc()) + suite.Equal(op.Len(), 1) + removedPeerID = op.Step(0).(operator.RemovePeer).FromStore + r1 = r1.Clone(core.WithRemoveStorePeer(removedPeerID)) + suite.cluster.PutRegion(r1) + r1 = suite.cluster.GetRegion(1) + suite.Len(r1.GetPeers(), 4) + for _, p := range r1.GetPeers() { + fmt.Println(p.GetStoreId(), p.Role.String()) + } + + // remove store 3 from region 1 + op = suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("remove-replaced-orphan-peer", op.Desc()) + suite.Equal(op.Len(), 1) + removedPeerID = op.Step(0).(operator.RemovePeer).FromStore + r1 = r1.Clone(core.WithRemoveStorePeer(removedPeerID)) + suite.cluster.PutRegion(r1) + r1 = suite.cluster.GetRegion(1) + suite.Len(r1.GetPeers(), 3) + + for _, p := range r1.GetPeers() { + suite.NotEqual(p.GetStoreId(), 1) + suite.NotEqual(p.GetStoreId(), 2) + suite.NotEqual(p.GetStoreId(), 3) + } +} + func (suite *ruleCheckerTestSuite) TestPriorityFitHealthWithDifferentRole1() { suite.cluster.SetEnableUseJointConsensus(true) suite.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"}) From 04cb434bc4d53cebaca6729fd16a781145d70722 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Mon, 6 Nov 2023 20:20:11 +0800 Subject: [PATCH 2/2] checker: avoid unnecessary remove disconnected peer with multi orphan peers (#7315) close tikv/pd#7249 Signed-off-by: lhy1024 --- server/core/store.go | 3 - server/schedule/checker/rule_checker.go | 28 +- server/schedule/checker/rule_checker_test.go | 450 ++++++++++++------- 3 files changed, 310 insertions(+), 171 deletions(-) diff --git a/server/core/store.go b/server/core/store.go index 247b1e828b3..1556d6d0955 100644 --- a/server/core/store.go +++ b/server/core/store.go @@ -496,9 +496,6 @@ var ( // tikv's store heartbeat for a short time, maybe caused by process restart or // temporary network failure. 
func (s *StoreInfo) IsDisconnected() bool { - if s == nil { - return true - } return s.DownTime() > storeDisconnectDuration } diff --git a/server/schedule/checker/rule_checker.go b/server/schedule/checker/rule_checker.go index da707488ea3..469fba82607 100644 --- a/server/schedule/checker/rule_checker.go +++ b/server/schedule/checker/rule_checker.go @@ -408,7 +408,11 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg isDisconnectedPeer := func(p *metapb.Peer) bool { // avoid to meet down store when fix orphan peers, // Isdisconnected is more strictly than IsUnhealthy. - return c.cluster.GetStore(p.GetStoreId()).IsDisconnected() + store := c.cluster.GetStore(p.GetStoreId()) + if store == nil { + return true + } + return store.IsDisconnected() } checkDownPeer := func(peers []*metapb.Peer) (*metapb.Peer, bool) { @@ -462,7 +466,7 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg if pinDownPeer.GetIsWitness() || orphanPeer.GetIsWitness() { continue } - // down peer's store should be disconnected + // pinDownPeer's store should be disconnected, because we use more strict judge before. if !isDisconnectedPeer(pinDownPeer) { continue } @@ -477,13 +481,14 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg return operator.CreatePromoteLearnerOperatorAndRemovePeer("replace-down-peer-with-orphan-peer", c.cluster, region, orphanPeer, pinDownPeer) case orphanPeerRole == metapb.PeerRole_Voter && destRole == metapb.PeerRole_Learner: return operator.CreateDemoteLearnerOperatorAndRemovePeer("replace-down-peer-with-orphan-peer", c.cluster, region, orphanPeer, pinDownPeer) - case orphanPeerRole == metapb.PeerRole_Voter && destRole == metapb.PeerRole_Voter && - isDisconnectedPeer(pinDownPeer) && !dstStore.IsDisconnected(): + case orphanPeerRole == destRole && isDisconnectedPeer(pinDownPeer) && !dstStore.IsDisconnected(): return operator.CreateRemovePeerOperator("remove-replaced-orphan-peer", c.cluster, 0, region, pinDownPeer.GetStoreId()) default: // destRole should not same with orphanPeerRole. if role is same, it fit with orphanPeer should be better than now. // destRole never be leader, so we not consider it. } + } else { + checkerCounter.WithLabelValues("rule_checker", "replace-orphan-peer-no-fit").Inc() } } } @@ -492,18 +497,25 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg // Ref https://github.com/tikv/pd/issues/4045 if len(fit.OrphanPeers) >= 2 { hasHealthPeer := false + var disconnectedPeer *metapb.Peer + for _, orphanPeer := range fit.OrphanPeers { + if isDisconnectedPeer(orphanPeer) { + disconnectedPeer = orphanPeer + break + } + } for _, orphanPeer := range fit.OrphanPeers { if isUnhealthyPeer(orphanPeer.GetId()) { checkerCounter.WithLabelValues("rule_checker", "remove-orphan-peer").Inc() return operator.CreateRemovePeerOperator("remove-unhealthy-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId) } - if isDisconnectedPeer(orphanPeer) { - checkerCounter.WithLabelValues("rule_checker", "remove-orphan-peer").Inc() - return operator.CreateRemovePeerOperator("remove-disconnected-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId) - } if hasHealthPeer { // there already exists a healthy orphan peer, so we can remove other orphan Peers. checkerCounter.WithLabelValues("rule_checker", "remove-orphan-peer").Inc() + // if there exists a disconnected orphan peer, we will pick it to remove firstly. 
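The second patch drops the nil-receiver guard from core.StoreInfo.IsDisconnected and handles the missing-store case at the call site instead. A sketch of that caller-side guard is shown below, written against a minimal one-method interface that is assumed here for illustration (PD's real cluster interface is much larger):

// storeProvider is a hypothetical stand-in for the checker's cluster.
type storeProvider interface {
	GetStore(id uint64) *core.StoreInfo
}

// peerStoreDisconnected mirrors the isDisconnectedPeer closure: a store the
// cluster no longer knows about is treated as disconnected rather than
// dereferenced, otherwise the normal DownTime-based check applies.
func peerStoreDisconnected(c storeProvider, p *metapb.Peer) bool {
	store := c.GetStore(p.GetStoreId())
	if store == nil {
		return true
	}
	return store.IsDisconnected()
}

Keeping the guard at the call site leaves IsDisconnected's behaviour unchanged for every other caller of core.StoreInfo, which appears to be the reason the first patch's nil check is reverted here.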
+ if disconnectedPeer != nil { + return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, disconnectedPeer.StoreId) + } return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId) } hasHealthPeer = true diff --git a/server/schedule/checker/rule_checker_test.go b/server/schedule/checker/rule_checker_test.go index 58869ac823c..e8160dc9370 100644 --- a/server/schedule/checker/rule_checker_test.go +++ b/server/schedule/checker/rule_checker_test.go @@ -17,6 +17,8 @@ package checker import ( "context" "fmt" + + "strconv" "testing" "time" @@ -186,7 +188,6 @@ func (suite *ruleCheckerTestSuite) TestFixToManyOrphanPeers() { suite.NotNil(op) suite.Equal("remove-orphan-peer", op.Desc()) suite.Equal(uint64(5), op.Step(0).(operator.RemovePeer).FromStore) - // Case2: // store 4, 5, 6 are orphan peers, and peer on store 3 is down peer. and peer on store 4, 5 are pending. region = suite.cluster.GetRegion(1) @@ -198,6 +199,91 @@ func (suite *ruleCheckerTestSuite) TestFixToManyOrphanPeers() { suite.NotNil(op) suite.Equal("remove-unhealthy-orphan-peer", op.Desc()) suite.Equal(uint64(4), op.Step(0).(operator.RemovePeer).FromStore) + // Case3: + // store 4, 5, 6 are orphan peers, and peer on one of stores is disconnect peer + // we should remove disconnect peer first. + for i := uint64(4); i <= 6; i++ { + region = suite.cluster.GetRegion(1) + suite.cluster.SetStoreDisconnect(i) + region = region.Clone( + core.WithDownPeers([]*pdpb.PeerStats{{Peer: region.GetStorePeer(3), DownSeconds: 60000}}), + core.WithPendingPeers([]*metapb.Peer{region.GetStorePeer(3)})) + suite.cluster.PutRegion(region) + op = suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("remove-orphan-peer", op.Desc()) + suite.Equal(i, op.Step(0).(operator.RemovePeer).FromStore) + suite.cluster.SetStoreUp(i) + } + // Case4: + // store 4, 5, 6 are orphan peers, and peer on two of stores is disconnect peer + // we should remove disconnect peer first. + for i := uint64(4); i <= 6; i++ { + region = suite.cluster.GetRegion(1) + suite.cluster.SetStoreDisconnect(4) + suite.cluster.SetStoreDisconnect(5) + suite.cluster.SetStoreDisconnect(6) + suite.cluster.SetStoreUp(i) + region = region.Clone( + core.WithDownPeers([]*pdpb.PeerStats{{Peer: region.GetStorePeer(3), DownSeconds: 60000}}), + core.WithPendingPeers([]*metapb.Peer{region.GetStorePeer(3)})) + suite.cluster.PutRegion(region) + op = suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("remove-orphan-peer", op.Desc()) + removedPeerStoreID := op.Step(0).(operator.RemovePeer).FromStore + suite.NotEqual(i, removedPeerStoreID) + region = suite.cluster.GetRegion(1) + newRegion := region.Clone(core.WithRemoveStorePeer(removedPeerStoreID)) + suite.cluster.PutRegion(newRegion) + op = suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("remove-orphan-peer", op.Desc()) + removedPeerStoreID = op.Step(0).(operator.RemovePeer).FromStore + suite.NotEqual(i, removedPeerStoreID) + suite.cluster.PutRegion(region) + } +} + +func (suite *ruleCheckerTestSuite) TestFixToManyOrphanPeers2() { + suite.cluster.AddLeaderStore(1, 1) + suite.cluster.AddLeaderStore(2, 1) + suite.cluster.AddLeaderStore(3, 1) + suite.cluster.AddLeaderStore(4, 1) + suite.cluster.AddLeaderStore(5, 1) + suite.cluster.AddRegionWithLearner(1, 1, []uint64{2, 3}, []uint64{4, 5}) + + // Case1: + // store 4, 5 are orphan peers, and peer on one of stores is disconnect peer + // we should remove disconnect peer first. 
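With more than one orphan peer, the checker now removes peers in a fixed priority: an unhealthy orphan first, then, once one healthy orphan has been seen, a disconnected orphan in preference to whichever orphan happens to be iterated next. The function below is an illustrative reduction of that ordering to plain store IDs; it is not code from the patch.

func pickOrphanToRemove(orphans []uint64, unhealthy, disconnected map[uint64]bool) (uint64, bool) {
	var disconnectedOrphan uint64
	hasDisconnected := false
	for _, id := range orphans {
		if disconnected[id] {
			disconnectedOrphan, hasDisconnected = id, true
			break
		}
	}
	hasHealthy := false
	for _, id := range orphans {
		if unhealthy[id] {
			return id, true // an unhealthy orphan is always removed first
		}
		if hasHealthy {
			if hasDisconnected {
				return disconnectedOrphan, true // prefer removing a disconnected orphan
			}
			return id, true // otherwise remove the extra orphan itself
		}
		hasHealthy = true
	}
	return 0, false // zero or one orphan peer: nothing to remove here
}

For example, with orphans on stores 4, 5 and 6 where only store 5 is disconnected and none are unhealthy, the function picks 5, which is the behaviour the new Case3 and Case4 loops above assert.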
+ for i := uint64(4); i <= 5; i++ { + region := suite.cluster.GetRegion(1) + suite.cluster.SetStoreDisconnect(i) + region = region.Clone( + core.WithDownPeers([]*pdpb.PeerStats{{Peer: region.GetStorePeer(3), DownSeconds: 60000}}), + core.WithPendingPeers([]*metapb.Peer{region.GetStorePeer(3)})) + suite.cluster.PutRegion(region) + op := suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("remove-orphan-peer", op.Desc()) + suite.Equal(i, op.Step(0).(operator.RemovePeer).FromStore) + suite.cluster.SetStoreUp(i) + } + + // Case2: + // store 4, 5 are orphan peers, and they are disconnect peers + // we should remove the peer on disconnect stores at least. + region := suite.cluster.GetRegion(1) + suite.cluster.SetStoreDisconnect(4) + suite.cluster.SetStoreDisconnect(5) + region = region.Clone( + core.WithDownPeers([]*pdpb.PeerStats{{Peer: region.GetStorePeer(3), DownSeconds: 60000}}), + core.WithPendingPeers([]*metapb.Peer{region.GetStorePeer(3)})) + suite.cluster.PutRegion(region) + op := suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("remove-orphan-peer", op.Desc()) + suite.Equal(uint64(4), op.Step(0).(operator.RemovePeer).FromStore) } func (suite *ruleCheckerTestSuite) TestFixOrphanPeers2() { @@ -686,173 +772,217 @@ func (suite *ruleCheckerTestSuite) TestPriorityFixOrphanPeer() { // Ref https://github.com/tikv/pd/issues/7249 https://github.com/tikv/tikv/issues/15799 func (suite *ruleCheckerTestSuite) TestFixOrphanPeerWithDisconnectedStoreAndRuleChanged() { - // init cluster with 5 replicas - suite.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"}) - suite.cluster.AddLabelsStore(2, 1, map[string]string{"host": "host2"}) - suite.cluster.AddLabelsStore(3, 1, map[string]string{"host": "host3"}) - suite.cluster.AddLabelsStore(4, 1, map[string]string{"host": "host4"}) - suite.cluster.AddLabelsStore(5, 1, map[string]string{"host": "host5"}) - storeIDs := []uint64{1, 2, 3, 4, 5} - suite.cluster.AddLeaderRegionWithRange(1, "", "", storeIDs[0], storeIDs[1:]...) 
- rule := &placement.Rule{ - GroupID: "pd", - ID: "default", - Role: placement.Voter, - Count: 5, - StartKey: []byte{}, - EndKey: []byte{}, - } - suite.ruleManager.SetRule(rule) - op := suite.rc.Check(suite.cluster.GetRegion(1)) - suite.Nil(op) - - // set store 1, 2 to disconnected - suite.cluster.SetStoreDisconnect(1) - suite.cluster.SetStoreDisconnect(2) - - // change rule to 3 replicas - rule = &placement.Rule{ - GroupID: "pd", - ID: "default", - Role: placement.Voter, - Count: 3, - StartKey: []byte{}, - EndKey: []byte{}, - Override: true, + // disconnect any two stores and change rule to 3 replicas + stores := []uint64{1, 2, 3, 4, 5} + testCases := [][]uint64{} + for i := 0; i < len(stores); i++ { + for j := i + 1; j < len(stores); j++ { + testCases = append(testCases, []uint64{stores[i], stores[j]}) + } } - suite.ruleManager.SetRule(rule) - - // remove store 1 from region 1 - op = suite.rc.Check(suite.cluster.GetRegion(1)) - suite.NotNil(op) - suite.Equal("remove-replaced-orphan-peer", op.Desc()) - suite.Equal(op.Len(), 2) - newLeaderID := op.Step(0).(operator.TransferLeader).ToStore - removedPeerID := op.Step(1).(operator.RemovePeer).FromStore - r1 := suite.cluster.GetRegion(1) - r1 = r1.Clone( - core.WithLeader(r1.GetPeer(newLeaderID)), - core.WithRemoveStorePeer(removedPeerID)) - suite.cluster.PutRegion(r1) - r1 = suite.cluster.GetRegion(1) - suite.Len(r1.GetPeers(), 4) + for _, leader := range stores { + var followers []uint64 + for i := 0; i < len(stores); i++ { + if stores[i] != leader { + followers = append(followers, stores[i]) + } + } - // remove store 2 from region 1 - op = suite.rc.Check(suite.cluster.GetRegion(1)) - suite.NotNil(op) - suite.Equal("remove-replaced-orphan-peer", op.Desc()) - suite.Equal(op.Len(), 1) - removedPeerID = op.Step(0).(operator.RemovePeer).FromStore - r1 = r1.Clone(core.WithRemoveStorePeer(removedPeerID)) - suite.cluster.PutRegion(r1) - r1 = suite.cluster.GetRegion(1) - suite.Len(r1.GetPeers(), 3) - for _, p := range r1.GetPeers() { - suite.NotEqual(p.GetStoreId(), 1) - suite.NotEqual(p.GetStoreId(), 2) + for _, testCase := range testCases { + // init cluster with 5 replicas + suite.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"}) + suite.cluster.AddLabelsStore(2, 1, map[string]string{"host": "host2"}) + suite.cluster.AddLabelsStore(3, 1, map[string]string{"host": "host3"}) + suite.cluster.AddLabelsStore(4, 1, map[string]string{"host": "host4"}) + suite.cluster.AddLabelsStore(5, 1, map[string]string{"host": "host5"}) + suite.cluster.AddLeaderRegionWithRange(1, "", "", leader, followers...) 
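Rather than hard-coding stores 1 and 2, the rewritten test enumerates every leader choice and every unordered pair of stores to disconnect, and the learner variant further down does the same with triples. The generic helper below is only an alternative shape for that enumeration, not code from the patch:

// combinations returns every k-element subset of ids, preserving order; the
// tests build the same sets with nested index loops.
func combinations(ids []uint64, k int) [][]uint64 {
	var out [][]uint64
	var walk func(start int, picked []uint64)
	walk = func(start int, picked []uint64) {
		if len(picked) == k {
			out = append(out, append([]uint64(nil), picked...))
			return
		}
		for i := start; i < len(ids); i++ {
			walk(i+1, append(picked, ids[i]))
		}
	}
	walk(0, nil)
	return out
}

// combinations([]uint64{1, 2, 3, 4, 5}, 2) yields the same 10 pairs as the
// nested loops above; with k = 3 over six stores it yields the 20 triples
// the learner test iterates.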
+ rule := &placement.Rule{ + GroupID: "pd", + ID: "default", + Role: placement.Voter, + Count: 5, + StartKey: []byte{}, + EndKey: []byte{}, + } + err := suite.ruleManager.SetRule(rule) + suite.NoError(err) + op := suite.rc.Check(suite.cluster.GetRegion(1)) + suite.Nil(op) + + // set two stores to disconnected + suite.cluster.SetStoreDisconnect(testCase[0]) + suite.cluster.SetStoreDisconnect(testCase[1]) + + // change rule to 3 replicas + rule = &placement.Rule{ + GroupID: "pd", + ID: "default", + Role: placement.Voter, + Count: 3, + StartKey: []byte{}, + EndKey: []byte{}, + Override: true, + } + suite.ruleManager.SetRule(rule) + + // remove peer from region 1 + for j := 1; j <= 2; j++ { + r1 := suite.cluster.GetRegion(1) + op = suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Contains(op.Desc(), "orphan") + var removedPeerStoreID uint64 + newLeaderStoreID := r1.GetLeader().GetStoreId() + for i := 0; i < op.Len(); i++ { + if s, ok := op.Step(i).(operator.RemovePeer); ok { + removedPeerStoreID = s.FromStore + } + if s, ok := op.Step(i).(operator.TransferLeader); ok { + newLeaderStoreID = s.ToStore + } + } + suite.NotZero(removedPeerStoreID) + r1 = r1.Clone( + core.WithLeader(r1.GetStorePeer(newLeaderStoreID)), + core.WithRemoveStorePeer(removedPeerStoreID)) + suite.cluster.PutRegion(r1) + r1 = suite.cluster.GetRegion(1) + suite.Len(r1.GetPeers(), 5-j) + } + + r1 := suite.cluster.GetRegion(1) + for _, p := range r1.GetPeers() { + suite.NotEqual(p.GetStoreId(), testCase[0]) + suite.NotEqual(p.GetStoreId(), testCase[1]) + } + suite.TearDownTest() + suite.SetupTest() + } } } // Ref https://github.com/tikv/pd/issues/7249 https://github.com/tikv/tikv/issues/15799 -func (suite *ruleCheckerTestSuite) TestFixOrphanPeerWithDisconnectedStoreAndRuleChanged2() { - // init cluster with 5 voters and 1 learner - suite.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"}) - suite.cluster.AddLabelsStore(2, 1, map[string]string{"host": "host2"}) - suite.cluster.AddLabelsStore(3, 1, map[string]string{"host": "host3"}) - suite.cluster.AddLabelsStore(4, 1, map[string]string{"host": "host4"}) - suite.cluster.AddLabelsStore(5, 1, map[string]string{"host": "host5"}) - suite.cluster.AddLabelsStore(6, 1, map[string]string{"host": "host6"}) - storeIDs := []uint64{1, 2, 3, 4, 5} - suite.cluster.AddLeaderRegionWithRange(1, "", "", storeIDs[0], storeIDs[1:]...) 
- r1 := suite.cluster.GetRegion(1) - r1 = r1.Clone(core.WithAddPeer(&metapb.Peer{Id: 6, StoreId: 6, Role: metapb.PeerRole_Learner})) - suite.cluster.PutRegion(r1) - err := suite.ruleManager.SetRules([]*placement.Rule{ - { - GroupID: "pd", - ID: "default", - Index: 100, - Override: true, - Role: placement.Voter, - Count: 5, - IsWitness: false, - }, - { - GroupID: "pd", - ID: "r1", - Index: 100, - Override: false, - Role: placement.Learner, - Count: 1, - IsWitness: false, - }, - }) - suite.NoError(err) - - op := suite.rc.Check(suite.cluster.GetRegion(1)) - suite.Nil(op) - - // set store 1, 2 to disconnected - suite.cluster.SetStoreDisconnect(1) - suite.cluster.SetStoreDisconnect(2) - suite.cluster.SetStoreDisconnect(3) - - // change rule to 3 replicas - suite.ruleManager.DeleteRuleGroup("pd") - suite.ruleManager.SetRule(&placement.Rule{ - GroupID: "pd", - ID: "default", - Role: placement.Voter, - Count: 2, - StartKey: []byte{}, - EndKey: []byte{}, - Override: true, - }) - - // remove store 1 from region 1 - op = suite.rc.Check(suite.cluster.GetRegion(1)) - suite.NotNil(op) - suite.Equal("remove-replaced-orphan-peer", op.Desc()) - suite.Equal(op.Len(), 2) - newLeaderID := op.Step(0).(operator.TransferLeader).ToStore - removedPeerID := op.Step(1).(operator.RemovePeer).FromStore - r1 = suite.cluster.GetRegion(1) - r1 = r1.Clone( - core.WithLeader(r1.GetPeer(newLeaderID)), - core.WithRemoveStorePeer(removedPeerID)) - suite.cluster.PutRegion(r1) - r1 = suite.cluster.GetRegion(1) - suite.Len(r1.GetPeers(), 5) - - // remove store 2 from region 1 - op = suite.rc.Check(suite.cluster.GetRegion(1)) - suite.NotNil(op) - suite.Equal("remove-replaced-orphan-peer", op.Desc()) - suite.Equal(op.Len(), 1) - removedPeerID = op.Step(0).(operator.RemovePeer).FromStore - r1 = r1.Clone(core.WithRemoveStorePeer(removedPeerID)) - suite.cluster.PutRegion(r1) - r1 = suite.cluster.GetRegion(1) - suite.Len(r1.GetPeers(), 4) - for _, p := range r1.GetPeers() { - fmt.Println(p.GetStoreId(), p.Role.String()) +func (suite *ruleCheckerTestSuite) TestFixOrphanPeerWithDisconnectedStoreAndRuleChangedWithLearner() { + // disconnect any three stores and change rule to 3 replicas + // and there is a learner in the disconnected store. 
+ stores := []uint64{1, 2, 3, 4, 5, 6} + testCases := [][]uint64{} + for i := 0; i < len(stores); i++ { + for j := i + 1; j < len(stores); j++ { + for k := j + 1; k < len(stores); k++ { + testCases = append(testCases, []uint64{stores[i], stores[j], stores[k]}) + } + } } + for _, leader := range stores { + var followers []uint64 + for i := 0; i < len(stores); i++ { + if stores[i] != leader { + followers = append(followers, stores[i]) + } + } - // remove store 3 from region 1 - op = suite.rc.Check(suite.cluster.GetRegion(1)) - suite.NotNil(op) - suite.Equal("remove-replaced-orphan-peer", op.Desc()) - suite.Equal(op.Len(), 1) - removedPeerID = op.Step(0).(operator.RemovePeer).FromStore - r1 = r1.Clone(core.WithRemoveStorePeer(removedPeerID)) - suite.cluster.PutRegion(r1) - r1 = suite.cluster.GetRegion(1) - suite.Len(r1.GetPeers(), 3) - - for _, p := range r1.GetPeers() { - suite.NotEqual(p.GetStoreId(), 1) - suite.NotEqual(p.GetStoreId(), 2) - suite.NotEqual(p.GetStoreId(), 3) + for _, testCase := range testCases { + for _, learnerStore := range testCase { + if learnerStore == leader { + continue + } + voterFollowers := []uint64{} + for _, follower := range followers { + if follower != learnerStore { + voterFollowers = append(voterFollowers, follower) + } + } + // init cluster with 5 voters and 1 learner + suite.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"}) + suite.cluster.AddLabelsStore(2, 1, map[string]string{"host": "host2"}) + suite.cluster.AddLabelsStore(3, 1, map[string]string{"host": "host3"}) + suite.cluster.AddLabelsStore(4, 1, map[string]string{"host": "host4"}) + suite.cluster.AddLabelsStore(5, 1, map[string]string{"host": "host5"}) + suite.cluster.AddLabelsStore(6, 1, map[string]string{"host": "host6"}) + suite.cluster.AddLeaderRegionWithRange(1, "", "", leader, voterFollowers...) 
+ err := suite.ruleManager.SetRules([]*placement.Rule{ + { + GroupID: "pd", + ID: "default", + Index: 100, + Override: true, + Role: placement.Voter, + Count: 5, + IsWitness: false, + }, + { + GroupID: "pd", + ID: "r1", + Index: 100, + Override: false, + Role: placement.Learner, + Count: 1, + IsWitness: false, + LabelConstraints: []placement.LabelConstraint{ + {Key: "host", Op: "in", Values: []string{"host" + strconv.FormatUint(learnerStore, 10)}}, + }, + }, + }) + suite.NoError(err) + r1 := suite.cluster.GetRegion(1) + r1 = r1.Clone(core.WithAddPeer(&metapb.Peer{Id: 12, StoreId: learnerStore, Role: metapb.PeerRole_Learner})) + suite.cluster.PutRegion(r1) + op := suite.rc.Check(suite.cluster.GetRegion(1)) + suite.Nil(op) + + // set three stores to disconnected + suite.cluster.SetStoreDisconnect(testCase[0]) + suite.cluster.SetStoreDisconnect(testCase[1]) + suite.cluster.SetStoreDisconnect(testCase[2]) + + // change rule to 3 replicas + suite.ruleManager.DeleteRule("pd", "r1") + suite.ruleManager.SetRule(&placement.Rule{ + GroupID: "pd", + ID: "default", + Role: placement.Voter, + Count: 3, + StartKey: []byte{}, + EndKey: []byte{}, + Override: true, + }) + + // remove peer from region 1 + for j := 1; j <= 3; j++ { + r1 := suite.cluster.GetRegion(1) + op = suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Contains(op.Desc(), "orphan") + var removedPeerStroeID uint64 + newLeaderStoreID := r1.GetLeader().GetStoreId() + for i := 0; i < op.Len(); i++ { + if s, ok := op.Step(i).(operator.RemovePeer); ok { + removedPeerStroeID = s.FromStore + } + if s, ok := op.Step(i).(operator.TransferLeader); ok { + newLeaderStoreID = s.ToStore + } + } + suite.NotZero(removedPeerStroeID) + r1 = r1.Clone( + core.WithLeader(r1.GetStorePeer(newLeaderStoreID)), + core.WithRemoveStorePeer(removedPeerStroeID)) + suite.cluster.PutRegion(r1) + r1 = suite.cluster.GetRegion(1) + suite.Len(r1.GetPeers(), 6-j) + } + + r1 = suite.cluster.GetRegion(1) + for _, p := range r1.GetPeers() { + suite.NotEqual(p.GetStoreId(), testCase[0]) + suite.NotEqual(p.GetStoreId(), testCase[1]) + suite.NotEqual(p.GetStoreId(), testCase[2]) + } + suite.TearDownTest() + suite.SetupTest() + } + } } }
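Both rewritten tests end with the same invariant: after the rule shrinks, no surviving peer of region 1 may sit on a store that was disconnected for that test case. A hypothetical helper expressing the check the tests inline with NotEqual loops could look like:

// assertNoPeerOnStores fails the suite if any remaining peer of region 1
// still lives on one of the given stores. Hypothetical helper, not in the patch.
func assertNoPeerOnStores(suite *ruleCheckerTestSuite, storeIDs ...uint64) {
	region := suite.cluster.GetRegion(1)
	for _, p := range region.GetPeers() {
		for _, id := range storeIDs {
			suite.NotEqual(p.GetStoreId(), id)
		}
	}
}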