diff --git a/pkg/apis/pingcap/v1alpha1/defaulting/tidbcluster.go b/pkg/apis/pingcap/v1alpha1/defaulting/tidbcluster.go index 84533f80bc..a9b76529ef 100644 --- a/pkg/apis/pingcap/v1alpha1/defaulting/tidbcluster.go +++ b/pkg/apis/pingcap/v1alpha1/defaulting/tidbcluster.go @@ -126,4 +126,7 @@ func setTiFlashSpecDefault(tc *v1alpha1.TidbCluster) { tc.Spec.TiFlash.BaseImage = defaultTiFlashImage } } + if tc.Spec.TiFlash.MaxFailoverCount == nil { + tc.Spec.TiFlash.MaxFailoverCount = pointer.Int32Ptr(3) + } } diff --git a/pkg/controller/tidbcluster/tidb_cluster_controller.go b/pkg/controller/tidbcluster/tidb_cluster_controller.go index 3579906fef..498dea46a2 100644 --- a/pkg/controller/tidbcluster/tidb_cluster_controller.go +++ b/pkg/controller/tidbcluster/tidb_cluster_controller.go @@ -109,7 +109,7 @@ func NewController( pdFailover := mm.NewPDFailover(cli, pdControl, pdFailoverPeriod, podInformer.Lister(), podControl, pvcInformer.Lister(), pvcControl, pvInformer.Lister(), recorder) tikvFailover := mm.NewTiKVFailover(tikvFailoverPeriod, recorder) tidbFailover := mm.NewTiDBFailover(tidbFailoverPeriod, recorder) - tiflashFailover := mm.NewTiFlashFailover(tiflashFailoverPeriod) + tiflashFailover := mm.NewTiFlashFailover(tiflashFailoverPeriod, recorder) pdUpgrader := mm.NewPDUpgrader(pdControl, podControl, podInformer.Lister()) tikvUpgrader := mm.NewTiKVUpgrader(pdControl, podControl, podInformer.Lister()) tiflashUpgrader := mm.NewTiFlashUpgrader(pdControl, podControl, podInformer.Lister()) diff --git a/pkg/manager/member/tiflash_failover.go b/pkg/manager/member/tiflash_failover.go index 08ffa919dd..569f9af7bb 100644 --- a/pkg/manager/member/tiflash_failover.go +++ b/pkg/manager/member/tiflash_failover.go @@ -14,22 +14,63 @@ package member import ( + "fmt" "time" "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/record" + "k8s.io/klog" ) type tiflashFailover struct { tiflashFailoverPeriod time.Duration + recorder record.EventRecorder } // NewTiFlashFailover returns a tiflash Failover -func NewTiFlashFailover(tiflashFailoverPeriod time.Duration) Failover { - return &tiflashFailover{tiflashFailoverPeriod} +func NewTiFlashFailover(tiflashFailoverPeriod time.Duration, recorder record.EventRecorder) Failover { + return &tiflashFailover{tiflashFailoverPeriod, recorder} } -// TODO: Finish the failover logic func (tff *tiflashFailover) Failover(tc *v1alpha1.TidbCluster) error { + ns := tc.GetNamespace() + tcName := tc.GetName() + + for storeID, store := range tc.Status.TiFlash.Stores { + podName := store.PodName + if store.LastTransitionTime.IsZero() { + continue + } + deadline := store.LastTransitionTime.Add(tff.tiflashFailoverPeriod) + exist := false + for _, failureStore := range tc.Status.TiFlash.FailureStores { + if failureStore.PodName == podName { + exist = true + break + } + } + if store.State == v1alpha1.TiKVStateDown && time.Now().After(deadline) && !exist { + if tc.Status.TiFlash.FailureStores == nil { + tc.Status.TiFlash.FailureStores = map[string]v1alpha1.TiKVFailureStore{} + } + if tc.Spec.TiFlash.MaxFailoverCount != nil && *tc.Spec.TiFlash.MaxFailoverCount > 0 { + maxFailoverCount := *tc.Spec.TiFlash.MaxFailoverCount + if len(tc.Status.TiFlash.FailureStores) >= int(maxFailoverCount) { + klog.Warningf("%s/%s TiFlash failure stores count reached the limit: %d", ns, tcName, tc.Spec.TiFlash.MaxFailoverCount) + return nil + } + tc.Status.TiFlash.FailureStores[storeID] = v1alpha1.TiKVFailureStore{ + PodName: podName, + StoreID: store.ID, + CreatedAt: metav1.Now(), + } + msg := fmt.Sprintf("store [%s] is Down", store.ID) + tff.recorder.Event(tc, corev1.EventTypeWarning, unHealthEventReason, fmt.Sprintf(unHealthEventMsgPattern, "tiflash", podName, msg)) + } + } + } return nil }