Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix unsafe recovery auto detect mode (#5754) #5756

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions server/cluster/unsafe_recovery_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -998,6 +998,21 @@ func (u *unsafeRecoveryController) generateForceLeaderPlan(newestRegionTree *reg
storeRecoveryPlan.ForceLeader.FailedStores = append(storeRecoveryPlan.ForceLeader.FailedStores, store)
}
}
if u.autoDetect {
// In auto-detect mode failedStores starts out empty, so add each detected failed store to the list.
for _, peer := range u.getFailedPeers(leader.Region()) {
found := false
for _, store := range storeRecoveryPlan.ForceLeader.FailedStores {
if store == peer.StoreId {
found = true
break
}
}
if !found {
storeRecoveryPlan.ForceLeader.FailedStores = append(storeRecoveryPlan.ForceLeader.FailedStores, peer.StoreId)
}
}
}
storeRecoveryPlan.ForceLeader.EnterForceLeaders = append(storeRecoveryPlan.ForceLeader.EnterForceLeaders, region.GetId())
u.recordAffectedRegion(leader.Region())
hasPlan = true
Expand Down
41 changes: 41 additions & 0 deletions server/cluster/unsafe_recovery_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,44 @@ func newStoreHeartbeat(storeID uint64, report *pdpb.StoreReport) *pdpb.StoreHear
}
}

// hasQuorum reports whether region still has a live majority of voters once
// every store listed in failedStores is treated as down.
//
// Peers are split into two voter sets by role: Voter and IncomingVoter peers
// form the incoming set, Voter and DemotingVoter peers form the outgoing set.
// The function returns true only when live voters strictly outnumber failed
// voters in BOTH sets; a set with no voters therefore never has a majority.
func hasQuorum(region *metapb.Region, failedStores []uint64) bool {
	// isFailed reports whether storeID appears in failedStores.
	isFailed := func(storeID uint64) bool {
		for _, failed := range failedStores {
			if failed == storeID {
				return true
			}
		}
		return false
	}

	// majorityAlive reports whether live voters strictly outnumber failed ones.
	majorityAlive := func(voters []*metapb.Peer) bool {
		live, down := 0, 0
		for _, voter := range voters {
			if isFailed(voter.GetStoreId()) {
				down++
				continue
			}
			live++
		}
		return live > down
	}

	// Joint consensus: a plain Voter belongs to both configurations, while
	// IncomingVoter and DemotingVoter each belong to only one of them.
	var incoming, outgoing []*metapb.Peer
	for _, peer := range region.Peers {
		switch peer.Role {
		case metapb.PeerRole_Voter:
			incoming = append(incoming, peer)
			outgoing = append(outgoing, peer)
		case metapb.PeerRole_IncomingVoter:
			incoming = append(incoming, peer)
		case metapb.PeerRole_DemotingVoter:
			outgoing = append(outgoing, peer)
		}
	}

	return majorityAlive(incoming) && majorityAlive(outgoing)
}

func applyRecoveryPlan(re *require.Assertions, storeID uint64, storeReports map[uint64]*pdpb.StoreReport, resp *pdpb.StoreHeartbeatResponse) {
plan := resp.GetRecoveryPlan()
if plan == nil {
Expand All @@ -55,6 +93,9 @@ func applyRecoveryPlan(re *require.Assertions, storeID uint64, storeReports map[
for _, report := range reports.PeerReports {
region := report.GetRegionState().GetRegion()
if region.GetId() == forceLeader {
if hasQuorum(region, forceLeaders.GetFailedStores()) {
re.FailNow("should not enter force leader when quorum is still alive")
}
report.IsForceLeader = true
break
}
Expand Down