diff --git a/server/cluster/unsafe_recovery_controller.go b/server/cluster/unsafe_recovery_controller.go index 7d55c5d8f6d..0fd45703040 100644 --- a/server/cluster/unsafe_recovery_controller.go +++ b/server/cluster/unsafe_recovery_controller.go @@ -998,6 +998,21 @@ func (u *unsafeRecoveryController) generateForceLeaderPlan(newestRegionTree *reg storeRecoveryPlan.ForceLeader.FailedStores = append(storeRecoveryPlan.ForceLeader.FailedStores, store) } } + if u.autoDetect { + // For auto detect, the failedStores is empty. So need to add the detected failed store to the list + for _, peer := range u.getFailedPeers(leader.Region()) { + found := false + for _, store := range storeRecoveryPlan.ForceLeader.FailedStores { + if store == peer.StoreId { + found = true + break + } + } + if !found { + storeRecoveryPlan.ForceLeader.FailedStores = append(storeRecoveryPlan.ForceLeader.FailedStores, peer.StoreId) + } + } + } storeRecoveryPlan.ForceLeader.EnterForceLeaders = append(storeRecoveryPlan.ForceLeader.EnterForceLeaders, region.GetId()) u.recordAffectedRegion(leader.Region()) hasPlan = true diff --git a/server/cluster/unsafe_recovery_controller_test.go b/server/cluster/unsafe_recovery_controller_test.go index 4165c0b2721..1baf1466853 100644 --- a/server/cluster/unsafe_recovery_controller_test.go +++ b/server/cluster/unsafe_recovery_controller_test.go @@ -40,6 +40,44 @@ func newStoreHeartbeat(storeID uint64, report *pdpb.StoreReport) *pdpb.StoreHear } } +func hasQuorum(region *metapb.Region, failedStores []uint64) bool { + hasQuorum := func(voters []*metapb.Peer) bool { + numFailedVoters := 0 + numLiveVoters := 0 + + for _, voter := range voters { + found := false + for _, store := range failedStores { + if store == voter.GetStoreId() { + found = true + break + } + } + if found { + numFailedVoters += 1 + } else { + numLiveVoters += 1 + } + } + return numFailedVoters < numLiveVoters + } + + // consider joint consensus + var incomingVoters []*metapb.Peer + var outgoingVoters []*metapb.Peer + + for _, peer := range region.Peers { + if peer.Role == metapb.PeerRole_Voter || peer.Role == metapb.PeerRole_IncomingVoter { + incomingVoters = append(incomingVoters, peer) + } + if peer.Role == metapb.PeerRole_Voter || peer.Role == metapb.PeerRole_DemotingVoter { + outgoingVoters = append(outgoingVoters, peer) + } + } + + return hasQuorum(incomingVoters) && hasQuorum(outgoingVoters) +} + func applyRecoveryPlan(re *require.Assertions, storeID uint64, storeReports map[uint64]*pdpb.StoreReport, resp *pdpb.StoreHeartbeatResponse) { plan := resp.GetRecoveryPlan() if plan == nil { @@ -55,6 +93,9 @@ func applyRecoveryPlan(re *require.Assertions, storeID uint64, storeReports map[ for _, report := range reports.PeerReports { region := report.GetRegionState().GetRegion() if region.GetId() == forceLeader { + if hasQuorum(region, forceLeaders.GetFailedStores()) { + re.FailNow("should not enter force leader when quorum is still alive") + } report.IsForceLeader = true break }