From de8c09886c645ea1e3eacf88dd123e37b98a27db Mon Sep 17 00:00:00 2001
From: luolibin
Date: Tue, 20 Aug 2019 17:43:15 +0800
Subject: [PATCH 1/5] add retry logic for updating PVCs

---
 pkg/scheduler/predicates/ha.go | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/pkg/scheduler/predicates/ha.go b/pkg/scheduler/predicates/ha.go
index 2b71a6e5635..19d65962b98 100644
--- a/pkg/scheduler/predicates/ha.go
+++ b/pkg/scheduler/predicates/ha.go
@@ -29,9 +29,11 @@ import (
 	apiv1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
+	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/tools/record"
+	"k8s.io/client-go/util/retry"
 )
 
 type ha struct {
@@ -302,7 +304,31 @@ func (h *ha) realPVCListFn(ns, instanceName, component string) (*apiv1.PersistentVolumeClaimList, error) {
 }
 
 func (h *ha) realUpdatePVCFn(pvc *apiv1.PersistentVolumeClaim) error {
-	_, err := h.kubeCli.CoreV1().PersistentVolumeClaims(pvc.GetNamespace()).Update(pvc)
+	ns := pvc.GetNamespace()
+	tcName := pvc.Labels[label.InstanceLabelKey]
+	pvcName := pvc.GetName()
+
+	labels := pvc.GetLabels()
+	ann := pvc.GetAnnotations()
+	err := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
+		_, updateErr := h.kubeCli.CoreV1().PersistentVolumeClaims(ns).Update(pvc)
+		if updateErr == nil {
+			glog.Infof("update PVC: [%s/%s] successfully, TidbCluster: %s", ns, pvcName, tcName)
+			return nil
+		}
+		glog.Errorf("failed to update PVC: [%s/%s], TidbCluster: %s, error: %v", ns, pvcName, tcName, updateErr)
+
+		if updated, err := h.pvcGetFn(ns, pvcName); err == nil {
+			// make a copy so we don't mutate the shared cache
+			pvc = updated.DeepCopy()
+			pvc.Labels = labels
+			pvc.Annotations = ann
+		} else {
+			utilruntime.HandleError(fmt.Errorf("error getting updated PVC %s/%s from lister: %v", ns, pvcName, err))
+		}
+
+		return updateErr
+	})
 	return err
 }

From f2476c56f5b1bc78223098fb9d184cb6e2e88cf1 Mon Sep 17 00:00:00 2001
From: luolibin
Date: Tue, 20 Aug 2019 17:45:48 +0800
Subject: [PATCH 2/5] add pvc cleaner to clean the pod-scheduling annotation

---
 .../tidbcluster/tidb_cluster_control.go       |  11 +-
 .../tidbcluster/tidb_cluster_control_test.go  |   3 +-
 .../tidbcluster/tidb_cluster_controller.go    |   5 +
 .../tidb_cluster_controller_test.go           |   1 +
 pkg/manager/member/pvc_cleaner.go             | 152 ++++++++
 pkg/manager/member/pvc_cleaner_test.go        | 359 ++++++++++++++++++
 6 files changed, 529 insertions(+), 2 deletions(-)
 create mode 100644 pkg/manager/member/pvc_cleaner.go
 create mode 100644 pkg/manager/member/pvc_cleaner_test.go

diff --git a/pkg/controller/tidbcluster/tidb_cluster_control.go b/pkg/controller/tidbcluster/tidb_cluster_control.go
index 8f8da6be684..e7e7cdace5b 100644
--- a/pkg/controller/tidbcluster/tidb_cluster_control.go
+++ b/pkg/controller/tidbcluster/tidb_cluster_control.go
@@ -41,6 +41,7 @@ func NewDefaultTidbClusterControl(
 	reclaimPolicyManager manager.Manager,
 	metaManager manager.Manager,
 	orphanPodsCleaner member.OrphanPodsCleaner,
+	pvcCleaner member.PVCCleanerInterface,
 	recorder record.EventRecorder) ControlInterface {
 	return &defaultTidbClusterControl{
 		tcControl,
@@ -50,6 +51,7 @@ func NewDefaultTidbClusterControl(
 		reclaimPolicyManager,
 		metaManager,
 		orphanPodsCleaner,
+		pvcCleaner,
 		recorder,
 	}
 }
@@ -62,6 +64,7 @@ type defaultTidbClusterControl struct {
 	reclaimPolicyManager manager.Manager
 	metaManager          manager.Manager
 	orphanPodsCleaner    member.OrphanPodsCleaner
+	pvcCleaner           member.PVCCleanerInterface
 	recorder             record.EventRecorder
 }
 
@@ -138,5 +141,11 @@ func (tcc *defaultTidbClusterControl) updateTidbCluster(tc *v1alpha1.TidbCluster) error {
 	//   - label.StoreIDLabelKey
 	//   - label.MemberIDLabelKey
 	//   - label.NamespaceLabelKey
-	return tcc.metaManager.Sync(tc)
+	if err := tcc.metaManager.Sync(tc); err != nil {
+		return err
+	}
+
+	// cleaning the pod scheduling annotation for pd and tikv
+	_, err := tcc.pvcCleaner.Clean(tc)
+	return err
 }
diff --git a/pkg/controller/tidbcluster/tidb_cluster_control_test.go b/pkg/controller/tidbcluster/tidb_cluster_control_test.go
index 3f3c4dbd849..a8061d9cc24 100644
--- a/pkg/controller/tidbcluster/tidb_cluster_control_test.go
+++ b/pkg/controller/tidbcluster/tidb_cluster_control_test.go
@@ -232,7 +232,8 @@ func newFakeTidbClusterControl() (ControlInterface, *meta.FakeReclaimPolicyManager,
 	reclaimPolicyManager := meta.NewFakeReclaimPolicyManager()
 	metaManager := meta.NewFakeMetaManager()
 	opc := mm.NewFakeOrphanPodsCleaner()
-	control := NewDefaultTidbClusterControl(tcControl, pdMemberManager, tikvMemberManager, tidbMemberManager, reclaimPolicyManager, metaManager, opc, recorder)
+	pcc := mm.NewFakePVCCleaner()
+	control := NewDefaultTidbClusterControl(tcControl, pdMemberManager, tikvMemberManager, tidbMemberManager, reclaimPolicyManager, metaManager, opc, pcc, recorder)
 
 	return control, reclaimPolicyManager, pdMemberManager, tikvMemberManager, tidbMemberManager, metaManager
 }
diff --git a/pkg/controller/tidbcluster/tidb_cluster_controller.go b/pkg/controller/tidbcluster/tidb_cluster_controller.go
index 3b971b4e0ed..c0278e5124d 100644
--- a/pkg/controller/tidbcluster/tidb_cluster_controller.go
+++ b/pkg/controller/tidbcluster/tidb_cluster_controller.go
@@ -171,6 +171,11 @@ func NewController(
 				podControl,
 				pvcInformer.Lister(),
 			),
+			mm.NewRealPVCCleaner(
+				podInformer.Lister(),
+				pvcControl,
+				pvcInformer.Lister(),
+			),
 			recorder,
 		),
 		queue: workqueue.NewNamedRateLimitingQueue(
diff --git a/pkg/controller/tidbcluster/tidb_cluster_controller_test.go b/pkg/controller/tidbcluster/tidb_cluster_controller_test.go
index 2f06163aeaa..8f52f451eaf 100644
--- a/pkg/controller/tidbcluster/tidb_cluster_controller_test.go
+++ b/pkg/controller/tidbcluster/tidb_cluster_controller_test.go
@@ -324,6 +324,7 @@ func newFakeTidbClusterController() (*Controller, cache.Indexer, cache.Indexer) {
 			podControl,
 		),
 		mm.NewFakeOrphanPodsCleaner(),
+		mm.NewFakePVCCleaner(),
 		recorder,
 	)
 
diff --git a/pkg/manager/member/pvc_cleaner.go b/pkg/manager/member/pvc_cleaner.go
new file mode 100644
index 00000000000..4ee0e417ccb
--- /dev/null
+++ b/pkg/manager/member/pvc_cleaner.go
@@ -0,0 +1,152 @@
+// Copyright 2019 PingCAP, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package member
+
+import (
+	"fmt"
+
+	"github.com/golang/glog"
+	"github.com/pingcap/tidb-operator/pkg/apis/pingcap.com/v1alpha1"
+	"github.com/pingcap/tidb-operator/pkg/controller"
+	"github.com/pingcap/tidb-operator/pkg/label"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/errors"
+	corelisters "k8s.io/client-go/listers/core/v1"
+)
+
+const (
+	skipReasonPVCCleanerIsNotPDOrTiKV            = "pvc cleaner: member type is not pd or tikv"
+	skipReasonPVCCleanerDeferDeletePVCNotHasLock = "pvc cleaner: defer delete PVC has no schedule lock"
+	skipReasonPVCCleanerPVCNotHasLock            = "pvc cleaner: pvc has no schedule lock"
+	skipReasonPVCCleanerPodWaitingForScheduling  = "pvc cleaner: waiting for pod scheduling"
+	skipReasonPVCCleanerPodNotFound              = "pvc cleaner: the corresponding pod of pvc has not been found"
+	skipReasonPVCCleanerWaitingForPVCSync        = "pvc cleaner: waiting for pvc's meta info to be synced"
+)
+
+// PVCCleanerInterface defines the logic for cleaning PVC-related resources
+type PVCCleanerInterface interface {
+	Clean(*v1alpha1.TidbCluster) (map[string]string, error)
+}
+
+type realPVCCleaner struct {
+	podLister  corelisters.PodLister
+	pvcControl controller.PVCControlInterface
+	pvcLister  corelisters.PersistentVolumeClaimLister
+}
+
+// NewRealPVCCleaner returns a realPVCCleaner
+func NewRealPVCCleaner(
+	podLister corelisters.PodLister,
+	pvcControl controller.PVCControlInterface,
+	pvcLister corelisters.PersistentVolumeClaimLister) PVCCleanerInterface {
+	return &realPVCCleaner{
+		podLister,
+		pvcControl,
+		pvcLister,
+	}
+}
+
+func (rpc *realPVCCleaner) Clean(tc *v1alpha1.TidbCluster) (map[string]string, error) {
+	ns := tc.GetNamespace()
+	tcName := tc.GetName()
+	// for unit test
+	skipReason := map[string]string{}
+
+	selector, err := label.New().Instance(tc.GetLabels()[label.InstanceLabelKey]).Selector()
+	if err != nil {
+		return skipReason, fmt.Errorf("cluster %s/%s assemble label selector failed, err: %v", ns, tcName, err)
+	}
+
+	pvcs, err := rpc.pvcLister.PersistentVolumeClaims(ns).List(selector)
+	if err != nil {
+		return skipReason, fmt.Errorf("cluster %s/%s list pvc failed, selector: %s, err: %v", ns, tcName, selector, err)
+	}
+
+	for _, pvc := range pvcs {
+		pvcName := pvc.GetName()
+		l := label.Label(pvc.Labels)
+		if !(l.IsPD() || l.IsTiKV()) {
+			skipReason[pvcName] = skipReasonPVCCleanerIsNotPDOrTiKV
+			continue
+		}
+
+		if pvc.Annotations[label.AnnPVCDeferDeleting] != "" {
+			if _, exist := pvc.Annotations[label.AnnPVCPodScheduling]; !exist {
+				// The defer deleting PVC has not pod scheduling annotation, do nothing
+				glog.V(4).Infof("cluster %s/%s defer delete pvc %s has not pod scheduling annotation, skip clean", ns, tcName, pvcName)
+				skipReason[pvcName] = skipReasonPVCCleanerDeferDeletePVCNotHasLock
+				continue
+			}
+			// The defer deleting PVC has pod scheduling annotation, so we need to delete the pod scheduling annotation
+			delete(pvc.Annotations, label.AnnPVCPodScheduling)
+			if _, err := rpc.pvcControl.UpdatePVC(tc, pvc); err != nil {
+				return skipReason, fmt.Errorf("cluster %s/%s remove pvc %s pod scheduling annotation failed, err: %v", ns, tcName, pvcName, err)
+			}
+			continue
+		}
+
+		podName, exist := pvc.Annotations[label.AnnPodNameKey]
+		if !exist {
+			// waiting for pvc's meta info to be synced
+			skipReason[pvcName] = skipReasonPVCCleanerWaitingForPVCSync
+			continue
+		}
+
+		pod, err := rpc.podLister.Pods(ns).Get(podName)
+		if err != nil {
+			if !errors.IsNotFound(err) {
+				return skipReason, fmt.Errorf("cluster %s/%s get pvc %s pod %s failed, err: %v", ns,
+					tcName, pvcName, podName, err)
+			}
+			skipReason[pvcName] = skipReasonPVCCleanerPodNotFound
+			continue
+		}
+
+		if _, exist := pvc.Annotations[label.AnnPVCPodScheduling]; !exist {
+			// The PVC has not pod scheduling annotation, do nothing
+			glog.V(4).Infof("cluster %s/%s pvc %s has not pod scheduling annotation, skip clean", ns, tcName, pvcName)
+			skipReason[pvcName] = skipReasonPVCCleanerPVCNotHasLock
+			continue
+		}
+
+		if pvc.Status.Phase != corev1.ClaimBound || pod.Spec.NodeName == "" {
+			// This pod has not been scheduled yet, no need to clean up the pvc pod schedule annotation
+			glog.V(4).Infof("cluster %s/%s pod %s has not been scheduled yet, skip clean pvc %s pod schedule annotation", ns, tcName, podName, pvcName)
+			skipReason[pvcName] = skipReasonPVCCleanerPodWaitingForScheduling
+			continue
+		}
+
+		delete(pvc.Annotations, label.AnnPVCPodScheduling)
+		if _, err := rpc.pvcControl.UpdatePVC(tc, pvc); err != nil {
+			return skipReason, fmt.Errorf("cluster %s/%s remove pvc %s pod scheduling annotation failed, err: %v", ns, tcName, pvcName, err)
+		}
+		glog.Infof("cluster %s/%s, clean pvc %s pod scheduling annotation successfully", ns, tcName, pvcName)
+	}
+
+	return skipReason, nil
+}
+
+var _ PVCCleanerInterface = &realPVCCleaner{}
+
+type fakePVCCleaner struct{}
+
+// NewFakePVCCleaner returns a fake PVC cleaner
+func NewFakePVCCleaner() PVCCleanerInterface {
+	return &fakePVCCleaner{}
+}
+
+func (fpc *fakePVCCleaner) Clean(_ *v1alpha1.TidbCluster) (map[string]string, error) {
+	return nil, nil
+}
+
+var _ PVCCleanerInterface = &fakePVCCleaner{}
diff --git a/pkg/manager/member/pvc_cleaner_test.go b/pkg/manager/member/pvc_cleaner_test.go
new file mode 100644
index 00000000000..e30f77efe6e
--- /dev/null
+++ b/pkg/manager/member/pvc_cleaner_test.go
@@ -0,0 +1,359 @@
+// Copyright 2018 PingCAP, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package member
+
+import (
+	"fmt"
+	"strings"
+	"testing"
+
"github.com/onsi/gomega" + "github.com/pingcap/tidb-operator/pkg/controller" + "github.com/pingcap/tidb-operator/pkg/label" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + kubeinformers "k8s.io/client-go/informers" + kubefake "k8s.io/client-go/kubernetes/fake" + "k8s.io/client-go/tools/cache" +) + +func TestPVCCleanerClean(t *testing.T) { + g := NewGomegaWithT(t) + + tc := newTidbClusterForPD() + type testcase struct { + name string + pods []*corev1.Pod + pvcs []*corev1.PersistentVolumeClaim + updatePVCFailed bool + expectFn func(*GomegaWithT, map[string]string, *realPVCCleaner, error) + } + testFn := func(test *testcase, t *testing.T) { + t.Log(test.name) + + pcc, podIndexer, pvcIndexer, pvcControl := newFakePVCCleaner() + if test.pods != nil { + for _, pod := range test.pods { + podIndexer.Add(pod) + } + } + if test.pvcs != nil { + for _, pvc := range test.pvcs { + pvcIndexer.Add(pvc) + } + } + if test.updatePVCFailed { + pvcControl.SetUpdatePVCError(fmt.Errorf("update PVC failed"), 0) + } + + skipReason, err := pcc.Clean(tc) + test.expectFn(g, skipReason, pcc, err) + } + tests := []testcase{ + { + name: "no pvcs", + pods: []*corev1.Pod{}, + pvcs: nil, + expectFn: func(g *GomegaWithT, skipReason map[string]string, _ *realPVCCleaner, err error) { + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(len(skipReason)).To(Equal(0)) + }, + }, + { + name: "not pd or tikv pvcs", + pods: nil, + pvcs: []*corev1.PersistentVolumeClaim{ + { + TypeMeta: metav1.TypeMeta{Kind: "PersistentVolumeClaim", APIVersion: "v1"}, + ObjectMeta: metav1.ObjectMeta{ + Namespace: metav1.NamespaceDefault, + Name: "tidb-test-tidb-0", + Labels: label.New().Instance(tc.GetLabels()[label.InstanceLabelKey]).TiDB().Labels(), + }, + }, + }, + expectFn: func(g *GomegaWithT, skipReason map[string]string, _ *realPVCCleaner, err error) { + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(len(skipReason)).To(Equal(1)) + g.Expect(skipReason["tidb-test-tidb-0"]).To(Equal(skipReasonPVCCleanerIsNotPDOrTiKV)) + }, + }, + { + name: "defer delete pvc that not has the schedule lock", + pods: nil, + pvcs: []*corev1.PersistentVolumeClaim{ + { + TypeMeta: metav1.TypeMeta{Kind: "PersistentVolumeClaim", APIVersion: "v1"}, + ObjectMeta: metav1.ObjectMeta{ + Namespace: metav1.NamespaceDefault, + Name: "pd-test-pd-0", + Labels: label.New().Instance(tc.GetLabels()[label.InstanceLabelKey]).PD().Labels(), + Annotations: map[string]string{label.AnnPVCDeferDeleting: "true"}, + }, + }, + }, + expectFn: func(g *GomegaWithT, skipReason map[string]string, _ *realPVCCleaner, err error) { + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(len(skipReason)).To(Equal(1)) + g.Expect(skipReason["pd-test-pd-0"]).To(Equal(skipReasonPVCCleanerDeferDeletePVCNotHasLock)) + }, + }, + { + name: "defer delete pvc that has the schedule lock but update pvc failed", + pods: nil, + pvcs: []*corev1.PersistentVolumeClaim{ + { + TypeMeta: metav1.TypeMeta{Kind: "PersistentVolumeClaim", APIVersion: "v1"}, + ObjectMeta: metav1.ObjectMeta{ + Namespace: metav1.NamespaceDefault, + Name: "pd-test-pd-0", + Labels: label.New().Instance(tc.GetLabels()[label.InstanceLabelKey]).PD().Labels(), + Annotations: map[string]string{ + label.AnnPVCDeferDeleting: "true", + label.AnnPVCPodScheduling: "true", + }, + }, + }, + }, + updatePVCFailed: true, + expectFn: func(g *GomegaWithT, skipReason map[string]string, _ *realPVCCleaner, err error) { + g.Expect(len(skipReason)).To(Equal(0)) + g.Expect(err).To(HaveOccurred()) + g.Expect(strings.Contains(err.Error(), "update PVC 
failed")).To(BeTrue()) + }, + }, + { + name: "defer delete pvc that has the schedule lock and update pvc success", + pods: nil, + pvcs: []*corev1.PersistentVolumeClaim{ + { + TypeMeta: metav1.TypeMeta{Kind: "PersistentVolumeClaim", APIVersion: "v1"}, + ObjectMeta: metav1.ObjectMeta{ + Namespace: metav1.NamespaceDefault, + Name: "pd-test-pd-0", + Labels: label.New().Instance(tc.GetLabels()[label.InstanceLabelKey]).PD().Labels(), + Annotations: map[string]string{ + label.AnnPVCDeferDeleting: "true", + label.AnnPVCPodScheduling: "true", + }, + }, + }, + }, + expectFn: func(g *GomegaWithT, skipReason map[string]string, pcc *realPVCCleaner, err error) { + g.Expect(len(skipReason)).To(Equal(0)) + g.Expect(err).NotTo(HaveOccurred()) + pvc, err := pcc.pvcLister.PersistentVolumeClaims(metav1.NamespaceDefault).Get("pd-test-pd-0") + g.Expect(err).NotTo(HaveOccurred()) + _, exist := pvc.Labels[label.AnnPVCPodScheduling] + g.Expect(exist).To(BeFalse()) + }, + }, + { + name: "pvc's meta info has not to be synced", + pods: nil, + pvcs: []*corev1.PersistentVolumeClaim{ + { + TypeMeta: metav1.TypeMeta{Kind: "PersistentVolumeClaim", APIVersion: "v1"}, + ObjectMeta: metav1.ObjectMeta{ + Namespace: metav1.NamespaceDefault, + Name: "pd-test-pd-0", + Labels: label.New().Instance(tc.GetLabels()[label.InstanceLabelKey]).PD().Labels(), + Annotations: map[string]string{ + label.AnnPVCPodScheduling: "true", + }, + }, + }, + }, + expectFn: func(g *GomegaWithT, skipReason map[string]string, _ *realPVCCleaner, err error) { + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(len(skipReason)).To(Equal(1)) + g.Expect(skipReason["pd-test-pd-0"]).To(Equal(skipReasonPVCCleanerWaitingForPVCSync)) + }, + }, + { + name: "the corresponding pod of pvc has not been found", + pods: nil, + pvcs: []*corev1.PersistentVolumeClaim{ + { + TypeMeta: metav1.TypeMeta{Kind: "PersistentVolumeClaim", APIVersion: "v1"}, + ObjectMeta: metav1.ObjectMeta{ + Namespace: metav1.NamespaceDefault, + Name: "pd-test-pd-0", + Labels: label.New().Instance(tc.GetLabels()[label.InstanceLabelKey]).PD().Labels(), + Annotations: map[string]string{ + label.AnnPVCPodScheduling: "true", + label.AnnPodNameKey: "test-pd-0", + }, + }, + }, + }, + expectFn: func(g *GomegaWithT, skipReason map[string]string, _ *realPVCCleaner, err error) { + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(len(skipReason)).To(Equal(1)) + g.Expect(skipReason["pd-test-pd-0"]).To(Equal(skipReasonPVCCleanerPodNotFound)) + }, + }, + { + name: "pvc that not has the schedule lock", + pods: []*corev1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pd-0", + Namespace: metav1.NamespaceDefault, + Labels: label.New().Instance(tc.GetLabels()[label.InstanceLabelKey]).PD().Labels(), + }, + }, + }, + pvcs: []*corev1.PersistentVolumeClaim{ + { + TypeMeta: metav1.TypeMeta{Kind: "PersistentVolumeClaim", APIVersion: "v1"}, + ObjectMeta: metav1.ObjectMeta{ + Namespace: metav1.NamespaceDefault, + Name: "pd-test-pd-0", + Labels: label.New().Instance(tc.GetLabels()[label.InstanceLabelKey]).PD().Labels(), + Annotations: map[string]string{ + label.AnnPodNameKey: "test-pd-0", + }, + }, + Status: corev1.PersistentVolumeClaimStatus{Phase: corev1.ClaimBound}, + }, + }, + expectFn: func(g *GomegaWithT, skipReason map[string]string, _ *realPVCCleaner, err error) { + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(len(skipReason)).To(Equal(1)) + g.Expect(skipReason["pd-test-pd-0"]).To(Equal(skipReasonPVCCleanerPVCNotHasLock)) + }, + }, + { + name: "waiting for pod scheduling", + pods: []*corev1.Pod{ + { + 
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "test-pd-0",
+						Namespace: metav1.NamespaceDefault,
+						Labels:    label.New().Instance(tc.GetLabels()[label.InstanceLabelKey]).PD().Labels(),
+					},
+				},
+			},
+			pvcs: []*corev1.PersistentVolumeClaim{
+				{
+					TypeMeta: metav1.TypeMeta{Kind: "PersistentVolumeClaim", APIVersion: "v1"},
+					ObjectMeta: metav1.ObjectMeta{
+						Namespace: metav1.NamespaceDefault,
+						Name:      "pd-test-pd-0",
+						Labels:    label.New().Instance(tc.GetLabels()[label.InstanceLabelKey]).PD().Labels(),
+						Annotations: map[string]string{
+							label.AnnPVCPodScheduling: "true",
+							label.AnnPodNameKey:       "test-pd-0",
+						},
+					},
+					Status: corev1.PersistentVolumeClaimStatus{Phase: corev1.ClaimBound},
+				},
+			},
+			expectFn: func(g *GomegaWithT, skipReason map[string]string, _ *realPVCCleaner, err error) {
+				g.Expect(err).NotTo(HaveOccurred())
+				g.Expect(len(skipReason)).To(Equal(1))
+				g.Expect(skipReason["pd-test-pd-0"]).To(Equal(skipReasonPVCCleanerPodWaitingForScheduling))
+			},
+		},
+		{
+			name: "pvc that needs to remove the schedule lock but update pvc failed",
+			pods: []*corev1.Pod{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "test-pd-0",
+						Namespace: metav1.NamespaceDefault,
+						Labels:    label.New().Instance(tc.GetLabels()[label.InstanceLabelKey]).PD().Labels(),
+					},
+					Spec: corev1.PodSpec{
+						NodeName: "node-1",
+					},
+				},
+			},
+			pvcs: []*corev1.PersistentVolumeClaim{
+				{
+					TypeMeta: metav1.TypeMeta{Kind: "PersistentVolumeClaim", APIVersion: "v1"},
+					ObjectMeta: metav1.ObjectMeta{
+						Namespace: metav1.NamespaceDefault,
+						Name:      "pd-test-pd-0",
+						Labels:    label.New().Instance(tc.GetLabels()[label.InstanceLabelKey]).PD().Labels(),
+						Annotations: map[string]string{
+							label.AnnPVCPodScheduling: "true",
+							label.AnnPodNameKey:       "test-pd-0",
+						},
+					},
+					Status: corev1.PersistentVolumeClaimStatus{Phase: corev1.ClaimBound},
+				},
+			},
+			updatePVCFailed: true,
+			expectFn: func(g *GomegaWithT, skipReason map[string]string, _ *realPVCCleaner, err error) {
+				g.Expect(len(skipReason)).To(Equal(0))
+				g.Expect(err).To(HaveOccurred())
+				g.Expect(strings.Contains(err.Error(), "update PVC failed")).To(BeTrue())
+			},
+		},
+		{
+			name: "pvc that needs to remove the schedule lock and update pvc success",
+			pods: []*corev1.Pod{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "test-pd-0",
+						Namespace: metav1.NamespaceDefault,
+						Labels:    label.New().Instance(tc.GetLabels()[label.InstanceLabelKey]).PD().Labels(),
+					},
+					Spec: corev1.PodSpec{
+						NodeName: "node-1",
+					},
+				},
+			},
+			pvcs: []*corev1.PersistentVolumeClaim{
+				{
+					TypeMeta: metav1.TypeMeta{Kind: "PersistentVolumeClaim", APIVersion: "v1"},
+					ObjectMeta: metav1.ObjectMeta{
+						Namespace: metav1.NamespaceDefault,
+						Name:      "pd-test-pd-0",
+						Labels:    label.New().Instance(tc.GetLabels()[label.InstanceLabelKey]).PD().Labels(),
+						Annotations: map[string]string{
+							label.AnnPVCPodScheduling: "true",
+							label.AnnPodNameKey:       "test-pd-0",
+						},
+					},
+					Status: corev1.PersistentVolumeClaimStatus{Phase: corev1.ClaimBound},
+				},
+			},
+			expectFn: func(g *GomegaWithT, skipReason map[string]string, pcc *realPVCCleaner, err error) {
+				g.Expect(len(skipReason)).To(Equal(0))
+				g.Expect(err).NotTo(HaveOccurred())
+				pvc, err := pcc.pvcLister.PersistentVolumeClaims(metav1.NamespaceDefault).Get("pd-test-pd-0")
+				g.Expect(err).NotTo(HaveOccurred())
+				_, exist := pvc.Annotations[label.AnnPVCPodScheduling]
+				g.Expect(exist).To(BeFalse())
+			},
+		},
+	}
+	for i := range tests {
+		testFn(&tests[i], t)
+	}
+}
+
+func newFakePVCCleaner() (*realPVCCleaner, cache.Indexer, cache.Indexer, *controller.FakePVCControl) {
+	kubeCli := kubefake.NewSimpleClientset()
+	kubeInformerFactory := kubeinformers.NewSharedInformerFactory(kubeCli, 0)
+	podInformer := kubeInformerFactory.Core().V1().Pods()
+	pvcInformer := kubeInformerFactory.Core().V1().PersistentVolumeClaims()
+	pvcControl := controller.NewFakePVCControl(pvcInformer)
+
+	return &realPVCCleaner{podInformer.Lister(), pvcControl, pvcInformer.Lister()},
+		podInformer.Informer().GetIndexer(), pvcInformer.Informer().GetIndexer(), pvcControl
+}

From 1adb0b47a52aaee074b216bdd1acf84ad1450ac5 Mon Sep 17 00:00:00 2001
From: luolibin
Date: Wed, 21 Aug 2019 00:26:37 +0800
Subject: [PATCH 3/5] add e2e test

---
 tests/actions.go | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/tests/actions.go b/tests/actions.go
index 7db94ae06fa..7aec1986079 100644
--- a/tests/actions.go
+++ b/tests/actions.go
@@ -770,6 +770,11 @@ func (oa *operatorActions) CheckTidbClusterStatus(info *TidbClusterConfig) error {
 		return false, nil
 	}
 
+	glog.V(4).Infof("check all pd and tikv instances have no pod scheduling annotation")
+	if b, err := oa.podsScheduleAnnHaveDeleted(tc); !b && err == nil {
+		return false, nil
+	}
+
 	glog.V(4).Infof("check store labels")
 	if b, err := oa.storeLabelsIsSet(tc, info.TopologyKey); !b && err == nil {
 		return false, nil
@@ -1515,6 +1520,37 @@ func (oa *operatorActions) schedulerHAFn(tc *v1alpha1.TidbCluster) (bool, error) {
 	return true, nil
 }
 
+func (oa *operatorActions) podsScheduleAnnHaveDeleted(tc *v1alpha1.TidbCluster) (bool, error) {
+	ns := tc.GetNamespace()
+	tcName := tc.GetName()
+
+	listOptions := metav1.ListOptions{
+		LabelSelector: labels.SelectorFromSet(
+			label.New().Instance(tcName).Labels()).String(),
+	}
+
+	pvcList, err := oa.kubeCli.CoreV1().PersistentVolumeClaims(ns).List(listOptions)
+	if err != nil {
+		glog.Errorf("failed to list pvcs for tidb cluster %s/%s, err: %v", ns, tcName, err)
+		return false, nil
+	}
+
+	for _, pvc := range pvcList.Items {
+		pvcName := pvc.GetName()
+		l := label.Label(pvc.Labels)
+		if !(l.IsPD() || l.IsTiKV()) {
+			continue
+		}
+
+		if _, exist := pvc.Annotations[label.AnnPVCPodScheduling]; exist {
+			glog.Errorf("tidb cluster %s/%s pvc %s has pod scheduling annotation", ns, tcName, pvcName)
+			return false, nil
+		}
+	}
+
+	return true, nil
+}
+
 func (oa *operatorActions) storeLabelsIsSet(tc *v1alpha1.TidbCluster, topologyKey string) (bool, error) {
 	pdCli := controller.GetPDClient(oa.pdControl, tc)
 	for _, store := range tc.Status.TiKV.Stores {

From 1702a4fbd61b20aed8910eebe41ae88049a15377 Mon Sep 17 00:00:00 2001
From: luolibin
Date: Thu, 22 Aug 2019 13:45:59 +0800
Subject: [PATCH 4/5] address comments

---
 pkg/manager/member/pvc_cleaner.go | 4 ++--
 pkg/scheduler/predicates/ha.go    | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/pkg/manager/member/pvc_cleaner.go b/pkg/manager/member/pvc_cleaner.go
index 4ee0e417ccb..b9f4b78cf1b 100644
--- a/pkg/manager/member/pvc_cleaner.go
+++ b/pkg/manager/member/pvc_cleaner.go
@@ -83,7 +83,7 @@ func (rpc *realPVCCleaner) Clean(tc *v1alpha1.TidbCluster) (map[string]string, error) {
 
 		if pvc.Annotations[label.AnnPVCDeferDeleting] != "" {
 			if _, exist := pvc.Annotations[label.AnnPVCPodScheduling]; !exist {
-				// The defer deleting PVC has not pod scheduling annotation, do nothing
+				// The defer deleting PVC without pod scheduling annotation, do nothing
 				glog.V(4).Infof("cluster %s/%s defer delete pvc %s has not pod scheduling annotation, skip clean", ns, tcName, pvcName)
 				skipReason[pvcName] = skipReasonPVCCleanerDeferDeletePVCNotHasLock
 				continue
@@ -113,7 +113,7 @@ func (rpc *realPVCCleaner) Clean(tc *v1alpha1.TidbCluster) (map[string]string, error) {
 		}
 
 		if _, exist := pvc.Annotations[label.AnnPVCPodScheduling]; !exist {
-			// The PVC has not pod scheduling annotation, do nothing
+			// The PVC without pod scheduling annotation, do nothing
 			glog.V(4).Infof("cluster %s/%s pvc %s has not pod scheduling annotation, skip clean", ns, tcName, pvcName)
 			skipReason[pvcName] = skipReasonPVCCleanerPVCNotHasLock
 			continue
diff --git a/pkg/scheduler/predicates/ha.go b/pkg/scheduler/predicates/ha.go
index 19d65962b98..ea0603d3db9 100644
--- a/pkg/scheduler/predicates/ha.go
+++ b/pkg/scheduler/predicates/ha.go
@@ -310,7 +310,7 @@ func (h *ha) realUpdatePVCFn(pvc *apiv1.PersistentVolumeClaim) error {
 
 	labels := pvc.GetLabels()
 	ann := pvc.GetAnnotations()
-	err := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
+	return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
 		_, updateErr := h.kubeCli.CoreV1().PersistentVolumeClaims(ns).Update(pvc)
 		if updateErr == nil {
 			glog.Infof("update PVC: [%s/%s] successfully, TidbCluster: %s", ns, pvcName, tcName)
@@ -329,7 +329,6 @@ func (h *ha) realUpdatePVCFn(pvc *apiv1.PersistentVolumeClaim) error {
 
 		return updateErr
 	})
-	return err
 }
 
 func (h *ha) realPVCGetFn(ns, pvcName string) (*apiv1.PersistentVolumeClaim, error) {

From af3c451384068f74ddfb8f670d34eed02f37585c Mon Sep 17 00:00:00 2001
From: luolibin
Date: Tue, 3 Sep 2019 23:01:53 +0800
Subject: [PATCH 5/5] refine code

---
 pkg/scheduler/predicates/ha.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/scheduler/predicates/ha.go b/pkg/scheduler/predicates/ha.go
index ea0603d3db9..3a4934be67f 100644
--- a/pkg/scheduler/predicates/ha.go
+++ b/pkg/scheduler/predicates/ha.go
@@ -269,7 +269,7 @@ func (h *ha) realAcquireLock(pod *apiv1.Pod) (*apiv1.PersistentVolumeClaim, *apiv1.PersistentVolumeClaim, error) {
 		return schedulingPVC, currentPVC, err
 	}
 	if schedulingPVC.Status.Phase != apiv1.ClaimBound || schedulingPod.Spec.NodeName == "" {
-		return schedulingPVC, currentPVC, fmt.Errorf("waiting for Pod %s/%s scheduling", ns, strings.TrimPrefix(schedulingPVC.GetName(), component+"-"))
+		return schedulingPVC, currentPVC, fmt.Errorf("waiting for Pod %s/%s scheduling", ns, schedulingPodName)
 	}
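
Reviewer note: PATCH 1 introduces, and PATCH 4 simplifies, a conflict-retry loop around the PVC update. For anyone unfamiliar with client-go's retry.RetryOnConflict, the sketch below is a minimal, self-contained illustration of the same pattern. It is an illustration, not code from this series: the package name and the updatePVCAnnotations helper with its arguments are hypothetical, and it refreshes the object with a direct API Get instead of the operator's lister-backed pvcGetFn, which is one reasonable way to pick up a fresh resourceVersion between attempts. It uses the same pre-context client-go call style (Update(pvc), Get(name, opts)) as the patches above.

package example

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/util/retry"
)

// updatePVCAnnotations applies the desired annotations on top of the latest
// revision of a PVC, retrying whenever the API server rejects the write with
// a 409 Conflict caused by a stale resourceVersion.
func updatePVCAnnotations(cli kubernetes.Interface, ns, name string, ann map[string]string) error {
	return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
		// Re-read the current revision on every attempt so each Update is
		// based on a fresh resourceVersion.
		pvc, err := cli.CoreV1().PersistentVolumeClaims(ns).Get(name, metav1.GetOptions{})
		if err != nil {
			return err
		}
		if pvc.Annotations == nil {
			pvc.Annotations = map[string]string{}
		}
		for k, v := range ann {
			pvc.Annotations[k] = v
		}
		_, err = cli.CoreV1().PersistentVolumeClaims(ns).Update(pvc)
		// A conflict error here makes RetryOnConflict back off and retry;
		// any other error (or nil) is returned to the caller unchanged.
		return err
	})
}

The important property is that only the mutation closure is retried: the caller's desired changes are re-applied to whatever revision currently exists, which mirrors how realUpdatePVCFn re-applies the saved labels and annotations to a fresh copy after each failed attempt.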