Skip to content

Commit

Permalink
fix tikv failover (#368)
Browse files Browse the repository at this point in the history
* fix tikv failover

* add TODO
  • Loading branch information
weekface authored and tennix committed Apr 9, 2019
1 parent 6378a2d commit dec665f
Show file tree
Hide file tree
Showing 9 changed files with 32 additions and 58 deletions.
2 changes: 1 addition & 1 deletion charts/tidb-cluster/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ pd:

# maxStoreDownTime is how long a store will be considered `down` when disconnected
# if a store is considered `down`, the regions will be migrated to other stores
maxStoreDownTime: 1h
maxStoreDownTime: 30m
# maxReplicas is the number of replicas for each region
maxReplicas: 3
resources:
Expand Down
2 changes: 2 additions & 0 deletions charts/tidb-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ controllerManager:
autoFailover: false
# pd failover period default(5m)
pdFailoverPeriod: 5m
# tikv failover period default(5m)
tikvFailoverPeriod: 5m
# tidb failover period default(5m)
tidbFailoverPeriod: 5m

Expand Down
5 changes: 3 additions & 2 deletions cmd/controller-manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ var (
workers int
autoFailover bool
pdFailoverPeriod time.Duration
tikvFailoverPeriod time.Duration
tidbFailoverPeriod time.Duration
leaseDuration = 15 * time.Second
renewDuration = 5 * time.Second
Expand All @@ -59,6 +60,7 @@ func init() {
flag.StringVar(&controller.DefaultStorageClassName, "default-storage-class-name", "standard", "Default storage class name")
flag.BoolVar(&autoFailover, "auto-failover", false, "Auto failover")
flag.DurationVar(&pdFailoverPeriod, "pd-failover-period", time.Duration(5*time.Minute), "PD failover period default(5m)")
flag.DurationVar(&tikvFailoverPeriod, "tikv-failover-period", time.Duration(5*time.Minute), "TiKV failover period default(5m)")
flag.DurationVar(&tidbFailoverPeriod, "tidb-failover-period", time.Duration(5*time.Minute), "TiDB failover period")

flag.Parse()
Expand Down Expand Up @@ -120,8 +122,7 @@ func main() {
},
}

tcController := tidbcluster.NewController(kubeCli, cli, informerFactory, kubeInformerFactory, autoFailover, pdFailoverPeriod, tidbFailoverPeriod)

tcController := tidbcluster.NewController(kubeCli, cli, informerFactory, kubeInformerFactory, autoFailover, pdFailoverPeriod, tikvFailoverPeriod, tidbFailoverPeriod)
controllerCtx, cancel := context.WithCancel(context.Background())
defer cancel()
go informerFactory.Start(controllerCtx.Done())
Expand Down
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -202,10 +202,14 @@ golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180807162357-acbc56fc7007 h1:UnHxDq9ldm4vol94wlSWDF3SU4IyC8IWVWtg266CzoY=
golang.org/x/sys v0.0.0-20180807162357-acbc56fc7007/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a h1:1BGLXjeY4akVXGgbC9HugT3Jv3hCI0z56oJR5vAMgBU=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2 h1:+DCIGbF/swA92ohVg0//6X2IVY3KZs6p9mix0ziNYJM=
golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/tools v0.0.0-20190403183509-8a44e74612bc h1:9OQUxGJQk/Rt2SmlbFsqnsyFaX1YiLbBfUJezBkCaa0=
golang.org/x/tools v0.0.0-20190403183509-8a44e74612bc/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
google.golang.org/appengine v1.1.0 h1:igQkv0AAhEIvTEpD5LIpAfav2eeVO9HBTjvKHVJPRSs=
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
Expand Down
3 changes: 2 additions & 1 deletion pkg/controller/tidbcluster/tidb_cluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ func NewController(
kubeInformerFactory kubeinformers.SharedInformerFactory,
autoFailover bool,
pdFailoverPeriod time.Duration,
tikvFailoverPeriod time.Duration,
tidbFailoverPeriod time.Duration,
) *Controller {
eventBroadcaster := record.NewBroadcaster()
Expand Down Expand Up @@ -100,7 +101,7 @@ func NewController(
pdScaler := mm.NewPDScaler(pdControl, pvcInformer.Lister(), pvcControl)
tikvScaler := mm.NewTiKVScaler(pdControl, pvcInformer.Lister(), pvcControl, podInformer.Lister())
pdFailover := mm.NewPDFailover(cli, pdControl, pdFailoverPeriod, podInformer.Lister(), podControl, pvcInformer.Lister(), pvcControl, pvInformer.Lister())
tikvFailover := mm.NewTiKVFailover(pdControl)
tikvFailover := mm.NewTiKVFailover(tikvFailoverPeriod)
tidbFailover := mm.NewTiDBFailover(tidbFailoverPeriod)
pdUpgrader := mm.NewPDUpgrader(pdControl, podControl, podInformer.Lister())
tikvUpgrader := mm.NewTiKVUpgrader(pdControl, podControl, podInformer.Lister())
Expand Down
1 change: 1 addition & 0 deletions pkg/controller/tidbcluster/tidb_cluster_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ func newFakeTidbClusterController() (*Controller, cache.Indexer, cache.Indexer)
autoFailover,
5*time.Minute,
5*time.Minute,
5*time.Minute,
)
tcc.tcListerSynced = alwaysReady
tcc.setListerSynced = alwaysReady
Expand Down
14 changes: 4 additions & 10 deletions pkg/manager/member/tikv_failover.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,30 +17,24 @@ import (
"time"

"github.com/pingcap/tidb-operator/pkg/apis/pingcap.com/v1alpha1"
"github.com/pingcap/tidb-operator/pkg/controller"
)

type tikvFailover struct {
pdControl controller.PDControlInterface
tikvFailoverPeriod time.Duration
}

// NewTiKVFailover returns a tikv Failover
func NewTiKVFailover(pdControl controller.PDControlInterface) Failover {
return &tikvFailover{pdControl}
func NewTiKVFailover(tikvFailoverPeriod time.Duration) Failover {
return &tikvFailover{tikvFailoverPeriod}
}

func (tf *tikvFailover) Failover(tc *v1alpha1.TidbCluster) error {
cfg, err := tf.pdControl.GetPDClient(tc).GetConfig()
if err != nil {
return err
}

for storeID, store := range tc.Status.TiKV.Stores {
podName := store.PodName
if store.LastTransitionTime.IsZero() {
continue
}
deadline := store.LastTransitionTime.Add(cfg.Schedule.MaxStoreDownTime.Duration)
deadline := store.LastTransitionTime.Add(tf.tikvFailoverPeriod)
exist := false
for _, failureStore := range tc.Status.TiKV.FailureStores {
if failureStore.PodName == podName {
Expand Down
56 changes: 12 additions & 44 deletions pkg/manager/member/tikv_failover_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,44 +14,28 @@
package member

import (
"fmt"
"testing"
"time"

. "github.com/onsi/gomega"
"github.com/pingcap/pd/pkg/typeutil"
"github.com/pingcap/pd/server"
"github.com/pingcap/tidb-operator/pkg/apis/pingcap.com/v1alpha1"
"github.com/pingcap/tidb-operator/pkg/controller"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestTiKVFailoverFailover(t *testing.T) {
g := NewGomegaWithT(t)

type testcase struct {
name string
update func(*v1alpha1.TidbCluster)
getCfgErr bool
err bool
expectFn func(*v1alpha1.TidbCluster)
name string
update func(*v1alpha1.TidbCluster)
err bool
expectFn func(*v1alpha1.TidbCluster)
}
testFn := func(test *testcase, t *testing.T) {
t.Log(test.name)
tc := newTidbClusterForPD()
test.update(tc)
tikvFailover, fakePDControl := newFakeTiKVFailover()
pdClient := controller.NewFakePDClient()
fakePDControl.SetPDClient(tc, pdClient)

pdClient.AddReaction(controller.GetConfigActionType, func(action *controller.Action) (interface{}, error) {
if test.getCfgErr {
return nil, fmt.Errorf("get config failed")
}
return &server.Config{
Schedule: server.ScheduleConfig{MaxStoreDownTime: typeutil.Duration{Duration: 1 * time.Hour}},
}, nil
})
tikvFailover := newFakeTiKVFailover()

err := tikvFailover.Failover(tc)
if test.err {
Expand Down Expand Up @@ -79,32 +63,20 @@ func TestTiKVFailoverFailover(t *testing.T) {
},
}
},
getCfgErr: false,
err: false,
err: false,
expectFn: func(tc *v1alpha1.TidbCluster) {
g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3))
g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(2))
},
},
{
name: "get config failed",
update: func(*v1alpha1.TidbCluster) {},
getCfgErr: true,
err: true,
expectFn: func(tc *v1alpha1.TidbCluster) {
g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3))
g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(0))
},
},
{
name: "tikv state is not Down",
update: func(tc *v1alpha1.TidbCluster) {
tc.Status.TiKV.Stores = map[string]v1alpha1.TiKVStore{
"1": {State: v1alpha1.TiKVStateUp, PodName: "tikv-1"},
}
},
getCfgErr: false,
err: false,
err: false,
expectFn: func(tc *v1alpha1.TidbCluster) {
g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3))
g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(0))
Expand All @@ -121,8 +93,7 @@ func TestTiKVFailoverFailover(t *testing.T) {
},
}
},
getCfgErr: false,
err: false,
err: false,
expectFn: func(tc *v1alpha1.TidbCluster) {
g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3))
g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(0))
Expand All @@ -138,8 +109,7 @@ func TestTiKVFailoverFailover(t *testing.T) {
},
}
},
getCfgErr: false,
err: false,
err: false,
expectFn: func(tc *v1alpha1.TidbCluster) {
g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3))
g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(0))
Expand All @@ -162,8 +132,7 @@ func TestTiKVFailoverFailover(t *testing.T) {
},
}
},
getCfgErr: false,
err: false,
err: false,
expectFn: func(tc *v1alpha1.TidbCluster) {
g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3))
g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(1))
Expand All @@ -175,7 +144,6 @@ func TestTiKVFailoverFailover(t *testing.T) {
}
}

func newFakeTiKVFailover() (*tikvFailover, *controller.FakePDControl) {
pdControl := controller.NewFakePDControl()
return &tikvFailover{pdControl}, pdControl
func newFakeTiKVFailover() *tikvFailover {
return &tikvFailover{1 * time.Hour}
}
3 changes: 3 additions & 0 deletions tests/failover.go
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,9 @@ func (oa *operatorActions) pdFailover(pod *corev1.Pod, tc *v1alpha1.TidbCluster)
return false
}

// TODO: we should confirm that the failed store actually becomes a tombstone — this is important.
// For example: take the same pod offline repeatedly and verify it appears among the tombstone stores,
// or take two pods offline and verify both appear among the tombstone stores.
func (oa *operatorActions) tikvFailover(pod *corev1.Pod, tc *v1alpha1.TidbCluster) bool {
failure := false
for _, failureStore := range tc.Status.TiKV.FailureStores {
Expand Down

0 comments on commit dec665f

Please sign in to comment.