From 77b92f415b88ec30b59319243c824fa75988630c Mon Sep 17 00:00:00 2001 From: Yecheng Fu Date: Wed, 5 Feb 2020 19:08:20 +0800 Subject: [PATCH 1/4] don't limit nodes and reduce parallel nodes (#1637) Signed-off-by: Yecheng Fu --- ci/pingcap_tidb_operator_build_kind.groovy | 32 ++++++++++------------ 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/ci/pingcap_tidb_operator_build_kind.groovy b/ci/pingcap_tidb_operator_build_kind.groovy index 3efc39de0f0..062072bf6ae 100644 --- a/ci/pingcap_tidb_operator_build_kind.groovy +++ b/ci/pingcap_tidb_operator_build_kind.groovy @@ -60,21 +60,19 @@ spec: emptyDir: {} - name: docker-graph emptyDir: {} - # we limit nodes to run to avoid some issues we found in our cluster, e.g. - # https://github.com/pingcap/tidb-operator/issues/1603 affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: kubernetes.io/hostname - operator: In - values: - - 172.16.5.64 - - 172.16.5.65 - - 172.16.5.67 - - 172.16.5.68 - - 172.16.5.70 + #nodeAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # nodeSelectorTerms: + # - matchExpressions: + # - key: kubernetes.io/hostname + # operator: In + # values: + # - 172.16.5.64 + # - 172.16.5.65 + # - 172.16.5.67 + # - 172.16.5.68 + # - 172.16.5.70 podAntiAffinity: preferredDuringSchedulingIgnoredDuringExecution: - weight: 100 @@ -225,13 +223,13 @@ def call(BUILD_BRANCH, CREDENTIALS_ID, CODECOV_CREDENTIALS_ID) { def MIRRORS = "DOCKER_IO_MIRROR=http://172.16.4.143:5000 QUAY_IO_MIRROR=http://172.16.4.143:5001" def builds = [:] builds["E2E v1.12.10"] = { - build("${MIRRORS} IMAGE_TAG=${GITHASH} SKIP_BUILD=y GINKGO_NODES=8 KUBE_VERSION=v1.12.10 REPORT_DIR=\$(pwd)/artifacts REPORT_PREFIX=v1.12.10_ ./hack/e2e.sh -- --preload-images --ginkgo.skip='\\[Serial\\]'", artifacts) + build("${MIRRORS} IMAGE_TAG=${GITHASH} SKIP_BUILD=y GINKGO_NODES=6 KUBE_VERSION=v1.12.10 REPORT_DIR=\$(pwd)/artifacts REPORT_PREFIX=v1.12.10_ ./hack/e2e.sh -- --preload-images --ginkgo.skip='\\[Serial\\]'", artifacts) } builds["E2E v1.12.10 AdvancedStatefulSet"] = { - build("${MIRRORS} IMAGE_TAG=${GITHASH} SKIP_BUILD=y GINKGO_NODES=8 KUBE_VERSION=v1.12.10 REPORT_DIR=\$(pwd)/artifacts REPORT_PREFIX=v1.12.10_advanced_statefulset ./hack/e2e.sh -- --preload-images --ginkgo.skip='\\[Serial\\]' --operator-features AdvancedStatefulSet=true", artifacts) + build("${MIRRORS} IMAGE_TAG=${GITHASH} SKIP_BUILD=y GINKGO_NODES=6 KUBE_VERSION=v1.12.10 REPORT_DIR=\$(pwd)/artifacts REPORT_PREFIX=v1.12.10_advanced_statefulset ./hack/e2e.sh -- --preload-images --ginkgo.skip='\\[Serial\\]' --operator-features AdvancedStatefulSet=true", artifacts) } builds["E2E v1.17.0"] = { - build("${MIRRORS} IMAGE_TAG=${GITHASH} SKIP_BUILD=y GINKGO_NODES=8 KUBE_VERSION=v1.17.0 REPORT_DIR=\$(pwd)/artifacts REPORT_PREFIX=v1.17.0_ ./hack/e2e.sh -- -preload-images --ginkgo.skip='\\[Serial\\]'", artifacts) + build("${MIRRORS} IMAGE_TAG=${GITHASH} SKIP_BUILD=y GINKGO_NODES=6 KUBE_VERSION=v1.17.0 REPORT_DIR=\$(pwd)/artifacts REPORT_PREFIX=v1.17.0_ ./hack/e2e.sh -- -preload-images --ginkgo.skip='\\[Serial\\]'", artifacts) } builds["E2E v1.12.10 Serial"] = { build("${MIRRORS} IMAGE_TAG=${GITHASH} SKIP_BUILD=y KUBE_VERSION=v1.12.10 REPORT_DIR=\$(pwd)/artifacts REPORT_PREFIX=v1.12.10_serial_ ./hack/e2e.sh -- --preload-images --ginkgo.focus='\\[Serial\\]' --install-operator=false", artifacts) From a00c5b4c92959a9745cf3b1a807b8a5a97676415 Mon Sep 17 00:00:00 2001 From: weekface Date: Thu, 6 Feb 2020 09:55:10 +0800 Subject: [PATCH 2/4] fix stability test (#1624) Co-authored-by: Yecheng Fu --- tests/actions.go | 11 ++++++++++- tests/cmd/stability/main.go | 1 + 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/actions.go b/tests/actions.go index 23c98123860..c57126f8ae0 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -814,6 +814,11 @@ func (oa *operatorActions) CleanTidbCluster(info *TidbClusterConfig) error { return fmt.Errorf("failed to delete dir pod %v", err) } + err = oa.kubeCli.CoreV1().Pods(info.Namespace).Delete(blockWriterPodName(info), nil) + if err != nil && !errors.IsNotFound(err) { + return fmt.Errorf("failed to delete blockwriter pod %v", err) + } + err = oa.kubeCli.CoreV1().Secrets(info.Namespace).Delete(info.InitSecretName, &metav1.DeleteOptions{}) if err != nil && !errors.IsNotFound(err) { return fmt.Errorf("failed to delete secret: %s, %v", info.InitSecretName, err) @@ -1034,7 +1039,7 @@ func (oa *operatorActions) getBlockWriterPod(info *TidbClusterConfig, database s return &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Namespace: info.Namespace, - Name: "blockwriter", + Name: blockWriterPodName(info), Labels: map[string]string{ "app": "blockwriter", }, @@ -3471,3 +3476,7 @@ func StartValidatingAdmissionWebhookServerOrDie(context *apimachinery.CertContex panic(fmt.Sprintf("failed to start webhook server %v", err)) } } + +func blockWriterPodName(info *TidbClusterConfig) string { + return fmt.Sprintf("%s-blockwriter", info.ClusterName) +} diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index aa864f737d6..567e193b3c7 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -418,6 +418,7 @@ func newTidbClusterConfig(ns, clusterName string) *tests.TidbClusterConfig { PDImage: fmt.Sprintf("pingcap/pd:%s", tidbVersion), TiKVImage: fmt.Sprintf("pingcap/tikv:%s", tidbVersion), TiDBImage: fmt.Sprintf("pingcap/tidb:%s", tidbVersion), + PumpImage: fmt.Sprintf("pingcap/tidb-binlog:%s", tidbVersion), StorageClassName: "local-storage", UserName: "root", Password: "admin", From bc9a923c95f543459f5981547a0da6cec1c527fa Mon Sep 17 00:00:00 2001 From: Yecheng Fu Date: Thu, 6 Feb 2020 11:02:44 +0800 Subject: [PATCH 3/4] clean containers on TERM signal to avoid cgroup leaking (#1642) * time out by ourselves Signed-off-by: Yecheng Fu * Clean containers on TERM signal in root process to avoid cgroup leaking. Signed-off-by: Yecheng Fu Co-authored-by: Song Gao --- ci/pingcap_tidb_operator_build_kind.groovy | 40 +++++++++++++--------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/ci/pingcap_tidb_operator_build_kind.groovy b/ci/pingcap_tidb_operator_build_kind.groovy index 062072bf6ae..f8a693997e9 100644 --- a/ci/pingcap_tidb_operator_build_kind.groovy +++ b/ci/pingcap_tidb_operator_build_kind.groovy @@ -17,8 +17,19 @@ spec: image: gcr.io/k8s-testimages/kubekins-e2e:v20191108-9467d02-master command: - runner.sh - - sleep - - 99d + # Clean containers on TERM signal in root process to avoid cgroup leaking. + # https://github.com/pingcap/tidb-operator/issues/1603#issuecomment-582402196 + - exec + - bash + - -c + - | + function clean() { + echo "info: clean all containers to avoid cgroup leaking" + docker kill $(docker ps -q) || true + docker system prune -af || true + } + trap clean TERM + sleep 1d & wait # we need privileged mode in order to do docker in docker securityContext: privileged: true @@ -61,18 +72,13 @@ spec: - name: docker-graph emptyDir: {} affinity: - #nodeAffinity: - # requiredDuringSchedulingIgnoredDuringExecution: - # nodeSelectorTerms: - # - matchExpressions: - # - key: kubernetes.io/hostname - # operator: In - # values: - # - 172.16.5.64 - # - 172.16.5.65 - # - 172.16.5.67 - # - 172.16.5.68 - # - 172.16.5.70 + # worker nodes only + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/master + operator: DoesNotExist podAntiAffinity: preferredDuringSchedulingIgnoredDuringExecution: - weight: 100 @@ -223,13 +229,13 @@ def call(BUILD_BRANCH, CREDENTIALS_ID, CODECOV_CREDENTIALS_ID) { def MIRRORS = "DOCKER_IO_MIRROR=http://172.16.4.143:5000 QUAY_IO_MIRROR=http://172.16.4.143:5001" def builds = [:] builds["E2E v1.12.10"] = { - build("${MIRRORS} IMAGE_TAG=${GITHASH} SKIP_BUILD=y GINKGO_NODES=6 KUBE_VERSION=v1.12.10 REPORT_DIR=\$(pwd)/artifacts REPORT_PREFIX=v1.12.10_ ./hack/e2e.sh -- --preload-images --ginkgo.skip='\\[Serial\\]'", artifacts) + build("${MIRRORS} IMAGE_TAG=${GITHASH} SKIP_BUILD=y GINKGO_NODES=8 KUBE_VERSION=v1.12.10 REPORT_DIR=\$(pwd)/artifacts REPORT_PREFIX=v1.12.10_ ./hack/e2e.sh -- --preload-images --ginkgo.skip='\\[Serial\\]'", artifacts) } builds["E2E v1.12.10 AdvancedStatefulSet"] = { - build("${MIRRORS} IMAGE_TAG=${GITHASH} SKIP_BUILD=y GINKGO_NODES=6 KUBE_VERSION=v1.12.10 REPORT_DIR=\$(pwd)/artifacts REPORT_PREFIX=v1.12.10_advanced_statefulset ./hack/e2e.sh -- --preload-images --ginkgo.skip='\\[Serial\\]' --operator-features AdvancedStatefulSet=true", artifacts) + build("${MIRRORS} IMAGE_TAG=${GITHASH} SKIP_BUILD=y GINKGO_NODES=8 KUBE_VERSION=v1.12.10 REPORT_DIR=\$(pwd)/artifacts REPORT_PREFIX=v1.12.10_advanced_statefulset ./hack/e2e.sh -- --preload-images --ginkgo.skip='\\[Serial\\]' --operator-features AdvancedStatefulSet=true", artifacts) } builds["E2E v1.17.0"] = { - build("${MIRRORS} IMAGE_TAG=${GITHASH} SKIP_BUILD=y GINKGO_NODES=6 KUBE_VERSION=v1.17.0 REPORT_DIR=\$(pwd)/artifacts REPORT_PREFIX=v1.17.0_ ./hack/e2e.sh -- -preload-images --ginkgo.skip='\\[Serial\\]'", artifacts) + build("${MIRRORS} IMAGE_TAG=${GITHASH} SKIP_BUILD=y GINKGO_NODES=8 KUBE_VERSION=v1.17.0 REPORT_DIR=\$(pwd)/artifacts REPORT_PREFIX=v1.17.0_ ./hack/e2e.sh -- -preload-images --ginkgo.skip='\\[Serial\\]'", artifacts) } builds["E2E v1.12.10 Serial"] = { build("${MIRRORS} IMAGE_TAG=${GITHASH} SKIP_BUILD=y KUBE_VERSION=v1.12.10 REPORT_DIR=\$(pwd)/artifacts REPORT_PREFIX=v1.12.10_serial_ ./hack/e2e.sh -- --preload-images --ginkgo.focus='\\[Serial\\]' --install-operator=false", artifacts) From 0329716603285c3b5dd1efe1570f7c09b8a31893 Mon Sep 17 00:00:00 2001 From: Yecheng Fu Date: Thu, 6 Feb 2020 11:51:24 +0800 Subject: [PATCH 4/4] use controller.GuaranteedUpdate to update TiDB Cluster CRD in e2e (#1632) --- tests/e2e/tidbcluster/tidbcluster.go | 121 +++++++++++++-------------- 1 file changed, 59 insertions(+), 62 deletions(-) diff --git a/tests/e2e/tidbcluster/tidbcluster.go b/tests/e2e/tidbcluster/tidbcluster.go index b470c10dc03..967a27880a9 100644 --- a/tests/e2e/tidbcluster/tidbcluster.go +++ b/tests/e2e/tidbcluster/tidbcluster.go @@ -348,27 +348,23 @@ var _ = ginkgo.Describe("[tidb-operator] TiDBCluster", func() { }) framework.ExpectNoError(err) - ginkgo.By(fmt.Sprintf("Sync TiDB service properties")) + ginkgo.By("Sync TiDB service properties") + ginkgo.By("Updating TiDB service") svcType := corev1.ServiceTypeNodePort trafficPolicy := corev1.ServiceExternalTrafficPolicyTypeLocal - - err = wait.PollImmediate(5*time.Second, 5*time.Minute, func() (bool, error) { - tc, err := cli.PingcapV1alpha1().TidbClusters(ns).Get(tcName, metav1.GetOptions{}) - framework.ExpectNoError(err, "Expected get TiDB cluster") + err = controller.GuaranteedUpdate(genericCli, tc, func() error { tc.Spec.TiDB.Service.Type = svcType tc.Spec.TiDB.Service.ExternalTrafficPolicy = &trafficPolicy tc.Spec.TiDB.Service.Annotations = map[string]string{ "test": "test", } - _, err = cli.PingcapV1alpha1().TidbClusters(ns).Update(tc) - if err != nil && !errors.IsConflict(err) { - return false, err - } - if errors.IsConflict(err) { - e2elog.Logf("conflicts when updating tidbcluster, retry...") - return false, nil - } + return nil + }) + framework.ExpectNoError(err) + + ginkgo.By("Waiting for the TiDB service to be synced") + err = wait.PollImmediate(5*time.Second, 5*time.Minute, func() (bool, error) { svc, err := c.CoreV1().Services(ns).Get(controller.TiDBMemberName(tcName), metav1.GetOptions{}) if err != nil { if errors.IsNotFound(err) { @@ -414,51 +410,6 @@ var _ = ginkgo.Describe("[tidb-operator] TiDBCluster", func() { tc, err := cli.PingcapV1alpha1().TidbClusters(cluster.Namespace).Get(cluster.ClusterName, metav1.GetOptions{}) framework.ExpectNoError(err, "Expected get tidbcluster") - pullPolicy := corev1.PullIfNotPresent - tc.Spec.Pump = &v1alpha1.PumpSpec{ - BaseImage: "pingcap/tidb-binlog", - ComponentSpec: v1alpha1.ComponentSpec{ - Version: &cluster.ClusterVersion, - ImagePullPolicy: &pullPolicy, - Affinity: &corev1.Affinity{ - PodAntiAffinity: &corev1.PodAntiAffinity{ - PreferredDuringSchedulingIgnoredDuringExecution: []corev1.WeightedPodAffinityTerm{ - { - PodAffinityTerm: corev1.PodAffinityTerm{ - Namespaces: []string{cluster.Namespace}, - TopologyKey: "rack", - }, - Weight: 50, - }, - }, - }, - }, - Tolerations: []corev1.Toleration{ - { - Effect: corev1.TaintEffectNoSchedule, - Key: "node-role", - Operator: corev1.TolerationOpEqual, - Value: "tidb", - }, - }, - SchedulerName: pointer.StringPtr("default-scheduler"), - ConfigUpdateStrategy: &updateStrategy, - }, - Replicas: 1, - StorageClassName: pointer.StringPtr("local-storage"), - ResourceRequirements: corev1.ResourceRequirements{ - Requests: corev1.ResourceList{ - corev1.ResourceStorage: resource.MustParse("10Gi"), - }, - }, - GenericConfig: tcconfig.New(map[string]interface{}{ - "addr": "0.0.0.0:8250", - "gc": 7, - "data-dir": "/data", - "heartbeat-interval": 2, - }), - } - // If using advanced statefulset, we must upgrade all Kubernetes statefulsets to advanced statefulsets first. if ocfg.Enabled(features.AdvancedStatefulSet) { stsList, err := c.AppsV1().StatefulSets(ns).List(metav1.ListOptions{}) @@ -475,7 +426,53 @@ var _ = ginkgo.Describe("[tidb-operator] TiDBCluster", func() { oldRev := oldPumpSet.Status.CurrentRevision framework.ExpectEqual(oldPumpSet.Status.UpdateRevision, oldRev, "Expected pump is not upgrading") - tcUpdated, err := cli.PingcapV1alpha1().TidbClusters(tc.Namespace).Update(tc) + err = controller.GuaranteedUpdate(genericCli, tc, func() error { + pullPolicy := corev1.PullIfNotPresent + tc.Spec.Pump = &v1alpha1.PumpSpec{ + BaseImage: "pingcap/tidb-binlog", + ComponentSpec: v1alpha1.ComponentSpec{ + Version: &cluster.ClusterVersion, + ImagePullPolicy: &pullPolicy, + Affinity: &corev1.Affinity{ + PodAntiAffinity: &corev1.PodAntiAffinity{ + PreferredDuringSchedulingIgnoredDuringExecution: []corev1.WeightedPodAffinityTerm{ + { + PodAffinityTerm: corev1.PodAffinityTerm{ + Namespaces: []string{cluster.Namespace}, + TopologyKey: "rack", + }, + Weight: 50, + }, + }, + }, + }, + Tolerations: []corev1.Toleration{ + { + Effect: corev1.TaintEffectNoSchedule, + Key: "node-role", + Operator: corev1.TolerationOpEqual, + Value: "tidb", + }, + }, + SchedulerName: pointer.StringPtr("default-scheduler"), + ConfigUpdateStrategy: &updateStrategy, + }, + Replicas: 1, + StorageClassName: pointer.StringPtr("local-storage"), + ResourceRequirements: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceStorage: resource.MustParse("10Gi"), + }, + }, + GenericConfig: tcconfig.New(map[string]interface{}{ + "addr": "0.0.0.0:8250", + "gc": 7, + "data-dir": "/data", + "heartbeat-interval": 2, + }), + } + return nil + }) framework.ExpectNoError(err, "Expected update tc") err = wait.PollImmediate(5*time.Second, 5*time.Minute, func() (bool, error) { @@ -487,7 +484,7 @@ var _ = ginkgo.Describe("[tidb-operator] TiDBCluster", func() { e2elog.Logf("error get pump statefulset: %v", err) return false, nil } - if !metav1.IsControlledBy(pumpSet, tcUpdated) { + if !metav1.IsControlledBy(pumpSet, tc) { e2elog.Logf("expect pump staetfulset adopted by tidbcluster, still waiting...") return false, nil } @@ -509,7 +506,7 @@ var _ = ginkgo.Describe("[tidb-operator] TiDBCluster", func() { e2elog.Logf("error get pump configmap: %v", err) return false, nil } - if !metav1.IsControlledBy(pumpConfigMap, tcUpdated) { + if !metav1.IsControlledBy(pumpConfigMap, tc) { e2elog.Logf("expect pump configmap adopted by tidbcluster, still waiting...") return false, nil } @@ -522,7 +519,7 @@ var _ = ginkgo.Describe("[tidb-operator] TiDBCluster", func() { e2elog.Logf("error get pump peer service: %v", err) return false, nil } - if !metav1.IsControlledBy(pumpPeerSvc, tcUpdated) { + if !metav1.IsControlledBy(pumpPeerSvc, tc) { e2elog.Logf("expect pump peer service adopted by tidbcluster, still waiting...") return false, nil }