Skip to content

Commit

Permalink
Merge branch 'support_auto_scaling_status' of https://github.com/Yisa…
Browse files Browse the repository at this point in the history
…er/tidb-operator into support_auto_scaling_status
  • Loading branch information
Song Gao committed Apr 15, 2020
2 parents d5ba990 + 45bbbad commit 64d704f
Show file tree
Hide file tree
Showing 10 changed files with 140 additions and 84 deletions.
13 changes: 13 additions & 0 deletions docs/api-references/docs.md
Original file line number Diff line number Diff line change
Expand Up @@ -5893,6 +5893,19 @@ Optional: Defaults to <code>.spec.services</code> in favor of backward compatibi
</tr>
<tr>
<td>
<code>maxFailoverCount</code></br>
<em>
int32
</em>
</td>
<td>
<em>(Optional)</em>
<p>MaxFailoverCount limits the maximum number of replicas that can be added during failover; 0 means no failover.
Optional: Defaults to 3</p>
</td>
</tr>
<tr>
<td>
<code>storageClassName</code></br>
<em>
string
Expand Down
5 changes: 5 additions & 0 deletions manifests/crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1818,6 +1818,11 @@ spec:
description: 'Limits describes the maximum amount of compute resources
allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/'
type: object
maxFailoverCount:
description: 'MaxFailoverCount limits the maximum number of replicas that can
be added during failover; 0 means no failover. Optional: Defaults to 3'
format: int32
type: integer
nodeSelector:
description: 'NodeSelector of the component. Merged into the cluster-level
nodeSelector if non-empty Optional: Defaults to cluster-level
Expand Down
3 changes: 3 additions & 0 deletions pkg/apis/pingcap/v1alpha1/defaulting/tidbcluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,9 @@ func setPdSpecDefault(tc *v1alpha1.TidbCluster) {
tc.Spec.PD.BaseImage = defaultPDImage
}
}
if tc.Spec.PD.MaxFailoverCount == nil {
tc.Spec.PD.MaxFailoverCount = pointer.Int32Ptr(3)
}
}

func setPumpSpecDefault(tc *v1alpha1.TidbCluster) {
Expand Down
7 changes: 7 additions & 0 deletions pkg/apis/pingcap/v1alpha1/openapi_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions pkg/apis/pingcap/v1alpha1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,12 @@ type PDSpec struct {
// +optional
Service *ServiceSpec `json:"service,omitempty"`

// MaxFailoverCount limits the maximum number of replicas that can be added during failover; 0 means no failover.
// Optional: Defaults to 3
// +kubebuilder:validation:Minimum=0
// +optional
MaxFailoverCount *int32 `json:"maxFailoverCount,omitempty"`

// The storageClassName of the persistent volume for PD data storage.
// Defaults to Kubernetes default storage class.
// +optional
Expand Down
5 changes: 5 additions & 0 deletions pkg/apis/pingcap/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 6 additions & 1 deletion pkg/manager/member/pd_failover.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ import (
"k8s.io/klog"
)

// TODO add maxFailoverCount
type pdFailover struct {
cli versioned.Interface
pdControl pdapi.PDControlInterface
Expand Down Expand Up @@ -93,6 +92,12 @@ func (pf *pdFailover) Failover(tc *v1alpha1.TidbCluster) error {
ns, tcName, healthCount, tc.PDStsDesiredReplicas(), tc.Spec.PD.Replicas, len(tc.Status.PD.FailureMembers))
}

failureReplicas := getFailureReplicas(tc)
if failureReplicas >= int(*tc.Spec.PD.MaxFailoverCount) {
klog.Errorf("PD failover replicas (%d) reaches the limit (%d), skip failover", failureReplicas, *tc.Spec.PD.MaxFailoverCount)
return nil
}

notDeletedCount := 0
for _, pdMember := range tc.Status.PD.FailureMembers {
if !pdMember.MemberDeleted {
Expand Down
125 changes: 81 additions & 44 deletions pkg/manager/member/pd_failover_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
kubefake "k8s.io/client-go/kubernetes/fake"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/record"
"k8s.io/utils/pointer"
)

func TestPDFailoverFailover(t *testing.T) {
Expand All @@ -42,6 +43,7 @@ func TestPDFailoverFailover(t *testing.T) {
type testcase struct {
name string
update func(*v1alpha1.TidbCluster)
maxFailoverCount int32
hasPVC bool
hasPod bool
podWithDeletionTimestamp bool
Expand All @@ -53,53 +55,12 @@ func TestPDFailoverFailover(t *testing.T) {
errExpectFn func(*GomegaWithT, error)
expectFn func(*v1alpha1.TidbCluster, *pdFailover)
}
testFn := func(test *testcase, t *testing.T) {
t.Log(test.name)
tc := newTidbClusterForPD()
test.update(tc)

pdFailover, pvcIndexer, podIndexer, fakePDControl, fakePodControl, fakePVCControl := newFakePDFailover()
pdClient := controller.NewFakePDClient(fakePDControl, tc)
pdFailover.recorder = recorder

pdClient.AddReaction(pdapi.DeleteMemberByIDActionType, func(action *pdapi.Action) (interface{}, error) {
if test.delMemberFailed {
return nil, fmt.Errorf("failed to delete member")
}
return nil, nil
})

if test.hasPVC {
pvc := newPVCForPDFailover(tc, v1alpha1.PDMemberType, 1)
if test.pvcWithDeletionTimestamp {
pvc.DeletionTimestamp = &metav1.Time{Time: time.Now()}
}
pvcIndexer.Add(pvc)
}
if test.hasPod {
pod := newPodForPDFailover(tc, v1alpha1.PDMemberType, 1)
if test.podWithDeletionTimestamp {
pod.DeletionTimestamp = &metav1.Time{Time: time.Now()}
}
podIndexer.Add(pod)
}
if test.delPodFailed {
fakePodControl.SetDeletePodError(errors.NewInternalError(fmt.Errorf("delete pod: API server failed")), 0)
}
if test.delPVCFailed {
fakePVCControl.SetDeletePVCError(errors.NewInternalError(fmt.Errorf("delete pvc: API server failed")), 0)
}

tc.Status.PD.Synced = !test.statusSyncFailed

err := pdFailover.Failover(tc)
test.errExpectFn(g, err)
test.expectFn(tc, pdFailover)
}
tests := []testcase{
{
name: "all members are ready",
update: allMembersReady,
maxFailoverCount: 3,
hasPVC: true,
hasPod: true,
podWithDeletionTimestamp: false,
Expand All @@ -118,6 +79,7 @@ func TestPDFailoverFailover(t *testing.T) {
{
name: "pd status sync failed",
update: allMembersReady,
maxFailoverCount: 3,
hasPVC: true,
hasPod: true,
podWithDeletionTimestamp: false,
Expand All @@ -135,6 +97,7 @@ func TestPDFailoverFailover(t *testing.T) {
{
name: "two members are not ready, not in quorum",
update: twoMembersNotReady,
maxFailoverCount: 3,
hasPVC: true,
hasPod: true,
podWithDeletionTimestamp: false,
Expand All @@ -159,6 +122,7 @@ func TestPDFailoverFailover(t *testing.T) {
{
name: "two members are ready and a failure member",
update: oneFailureMember,
maxFailoverCount: 3,
hasPVC: true,
hasPod: true,
podWithDeletionTimestamp: false,
Expand Down Expand Up @@ -187,6 +151,7 @@ func TestPDFailoverFailover(t *testing.T) {
pd1.LastTransitionTime = metav1.Time{Time: time.Now().Add(-2 * time.Minute)}
tc.Status.PD.Members[pd1Name] = pd1
},
maxFailoverCount: 3,
hasPVC: true,
hasPod: true,
podWithDeletionTimestamp: false,
Expand All @@ -212,6 +177,7 @@ func TestPDFailoverFailover(t *testing.T) {
pd1.LastTransitionTime = metav1.Time{}
tc.Status.PD.Members[pd1Name] = pd1
},
maxFailoverCount: 3,
hasPVC: true,
hasPod: true,
podWithDeletionTimestamp: false,
Expand All @@ -231,6 +197,7 @@ func TestPDFailoverFailover(t *testing.T) {
{
name: "has one not ready member, don't have pvc",
update: oneNotReadyMember,
maxFailoverCount: 3,
hasPVC: false,
hasPod: true,
podWithDeletionTimestamp: false,
Expand All @@ -253,6 +220,7 @@ func TestPDFailoverFailover(t *testing.T) {
{
name: "has one not ready member",
update: oneNotReadyMember,
maxFailoverCount: 3,
hasPVC: true,
hasPod: true,
podWithDeletionTimestamp: false,
Expand All @@ -278,9 +246,30 @@ func TestPDFailoverFailover(t *testing.T) {
g.Expect(events[1]).To(ContainSubstring("Unhealthy pd pod[test-pd-1] is unhealthy, msg:pd member[12891273174085095651] is unhealthy"))
},
},
{
name: "has one not ready member but maxFailoverCount is 0",
update: oneNotReadyMember,
maxFailoverCount: 0,
hasPVC: true,
hasPod: true,
podWithDeletionTimestamp: false,
delMemberFailed: false,
delPodFailed: false,
delPVCFailed: false,
statusSyncFailed: false,
errExpectFn: errExpectNil,
expectFn: func(tc *v1alpha1.TidbCluster, _ *pdFailover) {
g.Expect(int(tc.Spec.PD.Replicas)).To(Equal(3))
g.Expect(len(tc.Status.PD.FailureMembers)).To(Equal(0))
events := collectEvents(recorder.Events)
g.Expect(events).To(HaveLen(1))
g.Expect(events[0]).To(ContainSubstring("test-pd-1(12891273174085095651) is unhealthy"))
},
},
{
name: "has one not ready member, and exceed deadline, don't have PVC, has Pod, delete pod success",
update: oneNotReadyMemberAndAFailureMember,
maxFailoverCount: 3,
hasPVC: false,
hasPod: true,
podWithDeletionTimestamp: false,
Expand Down Expand Up @@ -310,6 +299,7 @@ func TestPDFailoverFailover(t *testing.T) {
pd1.MemberID = "wrong-id"
tc.Status.PD.FailureMembers[pd1Name] = pd1
},
maxFailoverCount: 3,
hasPVC: false,
hasPod: true,
podWithDeletionTimestamp: false,
Expand All @@ -335,6 +325,7 @@ func TestPDFailoverFailover(t *testing.T) {
{
name: "has one not ready member, and exceed deadline, don't have PVC, has Pod, delete member failed",
update: oneNotReadyMemberAndAFailureMember,
maxFailoverCount: 3,
hasPVC: false,
hasPod: true,
podWithDeletionTimestamp: false,
Expand All @@ -360,6 +351,7 @@ func TestPDFailoverFailover(t *testing.T) {
{
name: "has one not ready member, and exceed deadline, don't have PVC, has Pod, delete pod failed",
update: oneNotReadyMemberAndAFailureMember,
maxFailoverCount: 3,
hasPVC: false,
hasPod: true,
podWithDeletionTimestamp: false,
Expand All @@ -386,6 +378,7 @@ func TestPDFailoverFailover(t *testing.T) {
{
name: "has one not ready member, and exceed deadline, has Pod, delete pvc failed",
update: oneNotReadyMemberAndAFailureMember,
maxFailoverCount: 3,
hasPVC: true,
hasPod: true,
podWithDeletionTimestamp: false,
Expand All @@ -412,6 +405,7 @@ func TestPDFailoverFailover(t *testing.T) {
{
name: "has one not ready member, and exceed deadline, has Pod with deletion timestamp",
update: oneNotReadyMemberAndAFailureMember,
maxFailoverCount: 3,
hasPVC: true,
hasPod: true,
podWithDeletionTimestamp: true,
Expand Down Expand Up @@ -441,6 +435,7 @@ func TestPDFailoverFailover(t *testing.T) {
{
name: "has one not ready member, and exceed deadline, has PVC with deletion timestamp",
update: oneNotReadyMemberAndAFailureMember,
maxFailoverCount: 3,
hasPVC: true,
hasPod: true,
podWithDeletionTimestamp: false,
Expand Down Expand Up @@ -470,8 +465,50 @@ func TestPDFailoverFailover(t *testing.T) {
},
}

for i := range tests {
testFn(&tests[i], t)
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
tc := newTidbClusterForPD()
tc.Spec.PD.MaxFailoverCount = pointer.Int32Ptr(test.maxFailoverCount)
test.update(tc)

pdFailover, pvcIndexer, podIndexer, fakePDControl, fakePodControl, fakePVCControl := newFakePDFailover()
pdClient := controller.NewFakePDClient(fakePDControl, tc)
pdFailover.recorder = recorder

pdClient.AddReaction(pdapi.DeleteMemberByIDActionType, func(action *pdapi.Action) (interface{}, error) {
if test.delMemberFailed {
return nil, fmt.Errorf("failed to delete member")
}
return nil, nil
})

if test.hasPVC {
pvc := newPVCForPDFailover(tc, v1alpha1.PDMemberType, 1)
if test.pvcWithDeletionTimestamp {
pvc.DeletionTimestamp = &metav1.Time{Time: time.Now()}
}
pvcIndexer.Add(pvc)
}
if test.hasPod {
pod := newPodForPDFailover(tc, v1alpha1.PDMemberType, 1)
if test.podWithDeletionTimestamp {
pod.DeletionTimestamp = &metav1.Time{Time: time.Now()}
}
podIndexer.Add(pod)
}
if test.delPodFailed {
fakePodControl.SetDeletePodError(errors.NewInternalError(fmt.Errorf("delete pod: API server failed")), 0)
}
if test.delPVCFailed {
fakePVCControl.SetDeletePVCError(errors.NewInternalError(fmt.Errorf("delete pvc: API server failed")), 0)
}

tc.Status.PD.Synced = !test.statusSyncFailed

err := pdFailover.Failover(tc)
test.errExpectFn(g, err)
test.expectFn(tc, pdFailover)
})
}
}

Expand Down
17 changes: 11 additions & 6 deletions pkg/manager/member/pd_member_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,16 @@ func (pmm *pdMemberManager) pdStatefulSetIsUpgrading(set *apps.StatefulSet, tc *
return false, nil
}

// getFailureReplicas counts the entries in tc.Status.PD.FailureMembers whose
// MemberDeleted flag is set and returns that count.
func getFailureReplicas(tc *v1alpha1.TidbCluster) int {
	var deletedCount int
	for _, member := range tc.Status.PD.FailureMembers {
		// Only failure members already marked as deleted contribute to the total.
		if !member.MemberDeleted {
			continue
		}
		deletedCount++
	}
	return deletedCount
}

func getNewPDSetForTidbCluster(tc *v1alpha1.TidbCluster, cm *corev1.ConfigMap) (*apps.StatefulSet, error) {
ns := tc.Namespace
tcName := tc.Name
Expand Down Expand Up @@ -568,12 +578,7 @@ func getNewPDSetForTidbCluster(tc *v1alpha1.TidbCluster, cm *corev1.ConfigMap) (
setName := controller.PDMemberName(tcName)
podAnnotations := CombineAnnotations(controller.AnnProm(2379), basePDSpec.Annotations())
stsAnnotations := getStsAnnotations(tc, label.PDLabelVal)
failureReplicas := 0
for _, failureMember := range tc.Status.PD.FailureMembers {
if failureMember.MemberDeleted {
failureReplicas++
}
}
failureReplicas := getFailureReplicas(tc)

pdContainer := corev1.Container{
Name: v1alpha1.PDMemberType.String(),
Expand Down
Loading

0 comments on commit 64d704f

Please sign in to comment.