From dec665fe9c3760f122282e58b3569ffdda78cf2f Mon Sep 17 00:00:00 2001
From: weekface
Date: Tue, 9 Apr 2019 17:27:40 +0800
Subject: [PATCH] fix tikv failover (#368)

* fix tikv failover

* add TODO
---
 charts/tidb-cluster/values.yaml            |  2 +-
 charts/tidb-operator/values.yaml           |  2 +
 cmd/controller-manager/main.go             |  5 +-
 go.sum                                     |  4 ++
 .../tidbcluster/tidb_cluster_controller.go |  3 +-
 .../tidb_cluster_controller_test.go        |  1 +
 pkg/manager/member/tikv_failover.go        | 14 ++---
 pkg/manager/member/tikv_failover_test.go   | 56 ++++---------------
 tests/failover.go                          |  3 +
 9 files changed, 32 insertions(+), 58 deletions(-)

diff --git a/charts/tidb-cluster/values.yaml b/charts/tidb-cluster/values.yaml
index 11b7545b8b..7e5228603a 100644
--- a/charts/tidb-cluster/values.yaml
+++ b/charts/tidb-cluster/values.yaml
@@ -56,7 +56,7 @@ pd:
   # maxStoreDownTime is how long a store will be considered `down` when disconnected
   # if a store is considered `down`, the regions will be migrated to other stores
-  maxStoreDownTime: 1h
+  maxStoreDownTime: 30m
   # maxReplicas is the number of replicas for each region
   maxReplicas: 3
   resources:
diff --git a/charts/tidb-operator/values.yaml b/charts/tidb-operator/values.yaml
index 2de213e0e9..bde3d7b254 100644
--- a/charts/tidb-operator/values.yaml
+++ b/charts/tidb-operator/values.yaml
@@ -32,6 +32,8 @@ controllerManager:
   autoFailover: false
   # pd failover period default(5m)
   pdFailoverPeriod: 5m
+  # tikv failover period default(5m)
+  tikvFailoverPeriod: 5m
   # tidb failover period default(5m)
   tidbFailoverPeriod: 5m
diff --git a/cmd/controller-manager/main.go b/cmd/controller-manager/main.go
index 086584b7b8..ded6a8d3b9 100644
--- a/cmd/controller-manager/main.go
+++ b/cmd/controller-manager/main.go
@@ -43,6 +43,7 @@ var (
     workers            int
     autoFailover       bool
     pdFailoverPeriod   time.Duration
+    tikvFailoverPeriod time.Duration
     tidbFailoverPeriod time.Duration
     leaseDuration      = 15 * time.Second
     renewDuration      = 5 * time.Second
@@ -59,6 +60,7 @@ func init() {
     flag.StringVar(&controller.DefaultStorageClassName, "default-storage-class-name", "standard", "Default storage class name")
     flag.BoolVar(&autoFailover, "auto-failover", false, "Auto failover")
     flag.DurationVar(&pdFailoverPeriod, "pd-failover-period", time.Duration(5*time.Minute), "PD failover period default(5m)")
+    flag.DurationVar(&tikvFailoverPeriod, "tikv-failover-period", time.Duration(5*time.Minute), "TiKV failover period default(5m)")
     flag.DurationVar(&tidbFailoverPeriod, "tidb-failover-period", time.Duration(5*time.Minute), "TiDB failover period")

     flag.Parse()
@@ -120,8 +122,7 @@ func main() {
         },
     }

-    tcController := tidbcluster.NewController(kubeCli, cli, informerFactory, kubeInformerFactory, autoFailover, pdFailoverPeriod, tidbFailoverPeriod)
-
+    tcController := tidbcluster.NewController(kubeCli, cli, informerFactory, kubeInformerFactory, autoFailover, pdFailoverPeriod, tikvFailoverPeriod, tidbFailoverPeriod)
     controllerCtx, cancel := context.WithCancel(context.Background())
     defer cancel()
     go informerFactory.Start(controllerCtx.Done())
diff --git a/go.sum b/go.sum
index f89194af59..8c7f444514 100644
--- a/go.sum
+++ b/go.sum
@@ -202,10 +202,14 @@ golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJ
 golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sys v0.0.0-20180807162357-acbc56fc7007 h1:UnHxDq9ldm4vol94wlSWDF3SU4IyC8IWVWtg266CzoY=
 golang.org/x/sys v0.0.0-20180807162357-acbc56fc7007/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a h1:1BGLXjeY4akVXGgbC9HugT3Jv3hCI0z56oJR5vAMgBU=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2 h1:+DCIGbF/swA92ohVg0//6X2IVY3KZs6p9mix0ziNYJM=
 golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
+golang.org/x/tools v0.0.0-20190403183509-8a44e74612bc h1:9OQUxGJQk/Rt2SmlbFsqnsyFaX1YiLbBfUJezBkCaa0=
+golang.org/x/tools v0.0.0-20190403183509-8a44e74612bc/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
 google.golang.org/appengine v1.1.0 h1:igQkv0AAhEIvTEpD5LIpAfav2eeVO9HBTjvKHVJPRSs=
 google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
 google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
diff --git a/pkg/controller/tidbcluster/tidb_cluster_controller.go b/pkg/controller/tidbcluster/tidb_cluster_controller.go
index 677802af01..38eb2f77b1 100644
--- a/pkg/controller/tidbcluster/tidb_cluster_controller.go
+++ b/pkg/controller/tidbcluster/tidb_cluster_controller.go
@@ -73,6 +73,7 @@ func NewController(
     kubeInformerFactory kubeinformers.SharedInformerFactory,
     autoFailover bool,
     pdFailoverPeriod time.Duration,
+    tikvFailoverPeriod time.Duration,
     tidbFailoverPeriod time.Duration,
 ) *Controller {
     eventBroadcaster := record.NewBroadcaster()
@@ -100,7 +101,7 @@ func NewController(
     pdScaler := mm.NewPDScaler(pdControl, pvcInformer.Lister(), pvcControl)
     tikvScaler := mm.NewTiKVScaler(pdControl, pvcInformer.Lister(), pvcControl, podInformer.Lister())
     pdFailover := mm.NewPDFailover(cli, pdControl, pdFailoverPeriod, podInformer.Lister(), podControl, pvcInformer.Lister(), pvcControl, pvInformer.Lister())
-    tikvFailover := mm.NewTiKVFailover(pdControl)
+    tikvFailover := mm.NewTiKVFailover(tikvFailoverPeriod)
     tidbFailover := mm.NewTiDBFailover(tidbFailoverPeriod)
     pdUpgrader := mm.NewPDUpgrader(pdControl, podControl, podInformer.Lister())
     tikvUpgrader := mm.NewTiKVUpgrader(pdControl, podControl, podInformer.Lister())
diff --git a/pkg/controller/tidbcluster/tidb_cluster_controller_test.go b/pkg/controller/tidbcluster/tidb_cluster_controller_test.go
index c9a442bb0a..f5745f9da0 100644
--- a/pkg/controller/tidbcluster/tidb_cluster_controller_test.go
+++ b/pkg/controller/tidbcluster/tidb_cluster_controller_test.go
@@ -233,6 +233,7 @@ func newFakeTidbClusterController() (*Controller, cache.Indexer, cache.Indexer)
         autoFailover,
         5*time.Minute,
         5*time.Minute,
+        5*time.Minute,
     )
     tcc.tcListerSynced = alwaysReady
     tcc.setListerSynced = alwaysReady
diff --git a/pkg/manager/member/tikv_failover.go b/pkg/manager/member/tikv_failover.go
index 6724cd4689..8f39e1d8fa 100644
--- a/pkg/manager/member/tikv_failover.go
+++ b/pkg/manager/member/tikv_failover.go
@@ -17,30 +17,24 @@ import (
     "time"

     "github.com/pingcap/tidb-operator/pkg/apis/pingcap.com/v1alpha1"
-    "github.com/pingcap/tidb-operator/pkg/controller"
 )

 type tikvFailover struct {
-    pdControl controller.PDControlInterface
+    tikvFailoverPeriod time.Duration
 }

 // NewTiKVFailover returns a tikv Failover
-func NewTiKVFailover(pdControl controller.PDControlInterface) Failover {
-    return &tikvFailover{pdControl}
+func NewTiKVFailover(tikvFailoverPeriod time.Duration) Failover {
+    return &tikvFailover{tikvFailoverPeriod}
 }

 func (tf *tikvFailover) Failover(tc *v1alpha1.TidbCluster) error {
-    cfg, err := tf.pdControl.GetPDClient(tc).GetConfig()
-    if err != nil {
-        return err
-    }
-
     for storeID, store := range tc.Status.TiKV.Stores {
         podName := store.PodName
         if store.LastTransitionTime.IsZero() {
             continue
         }
-        deadline := store.LastTransitionTime.Add(cfg.Schedule.MaxStoreDownTime.Duration)
+        deadline := store.LastTransitionTime.Add(tf.tikvFailoverPeriod)
         exist := false
         for _, failureStore := range tc.Status.TiKV.FailureStores {
             if failureStore.PodName == podName {
diff --git a/pkg/manager/member/tikv_failover_test.go b/pkg/manager/member/tikv_failover_test.go
index b5be0ca0a8..2c82de803e 100644
--- a/pkg/manager/member/tikv_failover_test.go
+++ b/pkg/manager/member/tikv_failover_test.go
@@ -14,15 +14,11 @@ package member
 import (
-    "fmt"
     "testing"
     "time"

     . "github.com/onsi/gomega"
-    "github.com/pingcap/pd/pkg/typeutil"
-    "github.com/pingcap/pd/server"
     "github.com/pingcap/tidb-operator/pkg/apis/pingcap.com/v1alpha1"
-    "github.com/pingcap/tidb-operator/pkg/controller"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )

@@ -30,28 +26,16 @@ func TestTiKVFailoverFailover(t *testing.T) {
     g := NewGomegaWithT(t)
     type testcase struct {
-        name      string
-        update    func(*v1alpha1.TidbCluster)
-        getCfgErr bool
-        err       bool
-        expectFn  func(*v1alpha1.TidbCluster)
+        name     string
+        update   func(*v1alpha1.TidbCluster)
+        err      bool
+        expectFn func(*v1alpha1.TidbCluster)
     }

     testFn := func(test *testcase, t *testing.T) {
         t.Log(test.name)
         tc := newTidbClusterForPD()
         test.update(tc)
-        tikvFailover, fakePDControl := newFakeTiKVFailover()
-        pdClient := controller.NewFakePDClient()
-        fakePDControl.SetPDClient(tc, pdClient)
-
-        pdClient.AddReaction(controller.GetConfigActionType, func(action *controller.Action) (interface{}, error) {
-            if test.getCfgErr {
-                return nil, fmt.Errorf("get config failed")
-            }
-            return &server.Config{
-                Schedule: server.ScheduleConfig{MaxStoreDownTime: typeutil.Duration{Duration: 1 * time.Hour}},
-            }, nil
-        })
+        tikvFailover := newFakeTiKVFailover()

         err := tikvFailover.Failover(tc)
         if test.err {
@@ -79,23 +63,12 @@ func TestTiKVFailoverFailover(t *testing.T) {
                     },
                 }
             },
-            getCfgErr: false,
-            err:       false,
+            err: false,
             expectFn: func(tc *v1alpha1.TidbCluster) {
                 g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3))
                 g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(2))
             },
         },
-        {
-            name:      "get config failed",
-            update:    func(*v1alpha1.TidbCluster) {},
-            getCfgErr: true,
-            err:       true,
-            expectFn: func(tc *v1alpha1.TidbCluster) {
-                g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3))
-                g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(0))
-            },
-        },
         {
             name: "tikv state is not Down",
             update: func(tc *v1alpha1.TidbCluster) {
@@ -103,8 +76,7 @@ func TestTiKVFailoverFailover(t *testing.T) {
                 "1": {State: v1alpha1.TiKVStateUp, PodName: "tikv-1"},
                 }
             },
-            getCfgErr: false,
-            err:       false,
+            err: false,
             expectFn: func(tc *v1alpha1.TidbCluster) {
                 g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3))
                 g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(0))
@@ -121,8 +93,7 @@ func TestTiKVFailoverFailover(t *testing.T) {
                 "1": {
                     State:              v1alpha1.TiKVStateDown,
                     PodName:            "tikv-1",
                     LastTransitionTime: metav1.Time{Time: time.Now().Add(-61 * time.Minute)},
                 },
                 }
             },
-            getCfgErr: false,
-            err:       false,
+            err: false,
             expectFn: func(tc *v1alpha1.TidbCluster) {
                 g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3))
                 g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(0))
@@ -138,8 +109,7 @@ func TestTiKVFailoverFailover(t *testing.T) {
                     LastTransitionTime: metav1.Time{Time: time.Now().Add(-70 * time.Minute)},
                 },
                 }
             },
-            getCfgErr: false,
-            err:       false,
+            err: false,
             expectFn: func(tc *v1alpha1.TidbCluster) {
                 g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3))
                 g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(0))
@@ -162,8 +132,7 @@ func TestTiKVFailoverFailover(t *testing.T) {
                     },
                 }
             },
-            getCfgErr: false,
-            err:       false,
+            err: false,
             expectFn: func(tc *v1alpha1.TidbCluster) {
                 g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3))
                 g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(1))
@@ -175,7 +144,6 @@ func TestTiKVFailoverFailover(t *testing.T) {
     }
 }

-func newFakeTiKVFailover() (*tikvFailover, *controller.FakePDControl) {
-    pdControl := controller.NewFakePDControl()
-    return &tikvFailover{pdControl}, pdControl
+func newFakeTiKVFailover() *tikvFailover {
+    return &tikvFailover{1 * time.Hour}
 }
diff --git a/tests/failover.go b/tests/failover.go
index ea1631909d..59aa87cce4 100644
--- a/tests/failover.go
+++ b/tests/failover.go
@@ -214,6 +214,9 @@ func (oa *operatorActions) pdFailover(pod *corev1.Pod, tc *v1alpha1.TidbCluster)
     return false
 }

+// TODO: this check should also confirm that the failed store actually shows up in the tombstone stores.
+// For example: take the same pod offline repeatedly and verify it appears in the tombstone stores,
+// or take two pods offline and verify that both appear there.
 func (oa *operatorActions) tikvFailover(pod *corev1.Pod, tc *v1alpha1.TidbCluster) bool {
     failure := false
     for _, failureStore := range tc.Status.TiKV.FailureStores {
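
Review note: the behavioural change in this patch is that TiKV failover no longer asks PD for schedule.max-store-down-time through pdControl; instead the operator marks a TiKV store as a failure store once it has stayed Down for longer than the new operator-side tikv-failover-period. The sketch below illustrates that check in isolation; it is a simplified approximation that uses stand-in types rather than the operator's v1alpha1 structs, and the exact fields recorded on a failure store are not fully shown by the hunks above, so treat those details as assumptions:

package main

import (
	"fmt"
	"time"
)

// Simplified stand-ins for the operator's store status types.
type tikvStore struct {
	PodName            string
	State              string // "Up" or "Down"
	LastTransitionTime time.Time
}

type failureStore struct {
	PodName string
	StoreID string
}

type tikvFailover struct {
	// tikvFailoverPeriod replaces the old lookup of PD's
	// schedule.max-store-down-time via pdControl.GetConfig().
	tikvFailoverPeriod time.Duration
}

// failover records every store that has been Down for longer than
// tikvFailoverPeriod, skipping stores that are already recorded or whose
// LastTransitionTime has never been set.
func (tf *tikvFailover) failover(stores map[string]tikvStore, failures map[string]failureStore) {
	for id, s := range stores {
		if s.LastTransitionTime.IsZero() {
			continue
		}
		if _, exist := failures[id]; exist {
			continue
		}
		deadline := s.LastTransitionTime.Add(tf.tikvFailoverPeriod)
		if s.State == "Down" && time.Now().After(deadline) {
			failures[id] = failureStore{PodName: s.PodName, StoreID: id}
		}
	}
}

func main() {
	tf := &tikvFailover{tikvFailoverPeriod: 5 * time.Minute}
	stores := map[string]tikvStore{
		"1": {PodName: "tikv-1", State: "Down", LastTransitionTime: time.Now().Add(-time.Hour)},
		"2": {PodName: "tikv-2", State: "Up", LastTransitionTime: time.Now()},
	}
	failures := map[string]failureStore{}
	tf.failover(stores, failures)
	fmt.Println(failures) // only store "1" (tikv-1) is past the failover deadline
}

Note that the tests above still expect Spec.TiKV.Replicas to stay at 3 even after a failure store is recorded, which suggests the actual replica scale-out happens outside this Failover method.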