From ca0a52bfeb8a3dbb86326863b34825405cec04a5 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Thu, 9 May 2019 15:26:57 +0800 Subject: [PATCH 01/19] support affinity --- .../tidb-cluster/templates/tidb-cluster.yaml | 21 +-- charts/tidb-cluster/values.yaml | 97 +++++++++++--- pkg/apis/pingcap.com/v1alpha1/types.go | 38 +++--- .../v1alpha1/zz_generated.deepcopy.go | 30 ++--- pkg/manager/member/pd_member_manager.go | 7 +- pkg/manager/member/tidb_member_manager.go | 7 +- pkg/manager/member/tikv_member_manager.go | 7 +- pkg/util/util.go | 120 ------------------ 8 files changed, 117 insertions(+), 210 deletions(-) diff --git a/charts/tidb-cluster/templates/tidb-cluster.yaml b/charts/tidb-cluster/templates/tidb-cluster.yaml index a0a91c4dab..f026d55e63 100644 --- a/charts/tidb-cluster/templates/tidb-cluster.yaml +++ b/charts/tidb-cluster/templates/tidb-cluster.yaml @@ -27,11 +27,8 @@ spec: {{- if .Values.pd.resources }} {{ toYaml .Values.pd.resources | indent 4 }} {{- end }} - {{- if .Values.pd.nodeSelector }} - nodeSelector: -{{ toYaml .Values.pd.nodeSelector | indent 6 }} - {{- end }} - nodeSelectorRequired: {{ .Values.nodeSelectorRequired | default true }} + affinity: +{{ toYaml .Values.pd.affinity | indent 6 }} {{- if .Values.pd.tolerations }} tolerations: {{ toYaml .Values.pd.tolerations | indent 4 }} @@ -46,11 +43,8 @@ spec: {{- if .Values.tikv.resources }} {{ toYaml .Values.tikv.resources | indent 4 }} {{- end }} - {{- if .Values.tikv.nodeSelector }} - nodeSelector: -{{ toYaml .Values.tikv.nodeSelector | indent 6 }} - {{- end }} - nodeSelectorRequired: {{ .Values.nodeSelectorRequired | default true }} + affinity: +{{ toYaml .Values.tikv.affinity | indent 6 }} {{- if .Values.tikv.tolerations }} tolerations: {{ toYaml .Values.tikv.tolerations | indent 4 }} @@ -68,11 +62,8 @@ spec: {{- if .Values.tidb.resources }} {{ toYaml .Values.tidb.resources | indent 4 }} {{- end }} - {{- if .Values.tidb.nodeSelector }} - nodeSelector: -{{ toYaml .Values.tidb.nodeSelector | indent 6 }} - {{- end }} - nodeSelectorRequired: {{ .Values.nodeSelectorRequired | default true }} + affinity: +{{ toYaml .Values.tidb.affinity | indent 6 }} {{- if .Values.tidb.tolerations }} tolerations: {{ toYaml .Values.tidb.tolerations | indent 4 }} diff --git a/charts/tidb-cluster/values.yaml b/charts/tidb-cluster/values.yaml index 6d264f0986..0dbe70da4e 100644 --- a/charts/tidb-cluster/values.yaml +++ b/charts/tidb-cluster/values.yaml @@ -66,16 +66,68 @@ pd: # cpu: 4000m # memory: 4Gi storage: 1Gi - # nodeSelector is used for scheduling pod, - # if nodeSelectorRequired is true, all the following labels must be matched - nodeSelector: {} - # kind: pd - # # zone is comma separated availability zone list - # zone: cn-bj1-01,cn-bj1-02 - # # region is comma separated region list - # region: cn-bj1 - # Tolerations are applied to pods, and allow pods to schedule onto nodes with matching taints. - # refer to https://kubernetes.io/docs/concepts/configuration/taint-and-toleration + + ## affinity defines pd scheduling rules,it's default settings is empty. + ## please read the affinity document before set your scheduling rule: + ## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity + affinity: {} + ## The following is typical example of affinity settings: + ## The PodAntiAffinity setting of the example keeps PD pods does not co-locate on a topology node as far as possible to improve the high availability of PD on Kubernetes. 
+ ## The NodeAffinity setting of the example ensure that the PD pods can only be scheduled to nodes with label:[type="pd"], + # affinity: + # podAntiAffinity: + # preferredDuringSchedulingIgnoredDuringExecution: + # # this term work when the nodes have the label named region + # - weight: 10 + # podAffinityTerm: + # labelSelector: + # matchLabels: + # app.kubernetes.io/instance: + # app.kubernetes.io/component: "pd" + # topologyKey: "region" + # namespaces: + # - + # # this term work when the nodes have the label named zone + # - weight: 20 + # podAffinityTerm: + # labelSelector: + # matchLabels: + # app.kubernetes.io/instance: + # app.kubernetes.io/component: "pd" + # topologyKey: "zone" + # namespaces: + # - + # # this term work when the nodes have the label named rack + # - weight: 40 + # podAffinityTerm: + # labelSelector: + # matchLabels: + # app.kubernetes.io/instance: + # app.kubernetes.io/component: "pd" + # topologyKey: "rack" + # namespaces: + # - + # # this term work when the nodes have the label named kubernetes.io/hostname + # - weight: 80 + # podAffinityTerm: + # labelSelector: + # matchLabels: + # app.kubernetes.io/instance: + # app.kubernetes.io/component: "pd" + # topologyKey: "kubernetes.io/hostname" + # namespaces: + # - + # nodeAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # nodeSelectorTerms: + # - matchExpressions: + # - key: "kind" + # operator: In + # values: + # - "pd" + + ## Tolerations are applied to pods, and allow pods to schedule onto nodes with matching taints. + ## refer to https://kubernetes.io/docs/concepts/configuration/taint-and-toleration tolerations: [] # - key: node-role # operator: Equal @@ -107,10 +159,14 @@ tikv: # cpu: 12000m # memory: 24Gi storage: 10Gi - nodeSelector: {} - # kind: tikv - # zone: cn-bj1-01,cn-bj1-02 - # region: cn-bj1 + + ## affinity defines tikv scheduling rules,affinity default settings is empty. + ## please read the affinity document before set your scheduling rule: + ## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity + affinity: {} + + ## Tolerations are applied to pods, and allow pods to schedule onto nodes with matching taints. + ## refer to https://kubernetes.io/docs/concepts/configuration/taint-and-toleration tolerations: [] # - key: node-role # operator: Equal @@ -166,10 +222,15 @@ tidb: requests: {} # cpu: 12000m # memory: 12Gi - nodeSelector: {} - # kind: tidb - # zone: cn-bj1-01,cn-bj1-02 - # region: cn-bj1 + + + ## affinity defines tikv scheduling rules,affinity default settings is empty. + ## please read the affinity document before set your scheduling rule: + ## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity + affinity: {} + + ## Tolerations are applied to pods, and allow pods to schedule onto nodes with matching taints. 
+ ## refer to https://kubernetes.io/docs/concepts/configuration/taint-and-toleration tolerations: [] # - key: node-role # operator: Equal diff --git a/pkg/apis/pingcap.com/v1alpha1/types.go b/pkg/apis/pingcap.com/v1alpha1/types.go index 458543df41..28579695a4 100644 --- a/pkg/apis/pingcap.com/v1alpha1/types.go +++ b/pkg/apis/pingcap.com/v1alpha1/types.go @@ -15,10 +15,9 @@ package v1alpha1 import ( apps "k8s.io/api/apps/v1beta1" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" - - corev1 "k8s.io/api/core/v1" ) const ( @@ -109,25 +108,23 @@ type TidbClusterStatus struct { // PDSpec contains details of PD member type PDSpec struct { ContainerSpec - Replicas int32 `json:"replicas"` - NodeSelector map[string]string `json:"nodeSelector,omitempty"` - NodeSelectorRequired bool `json:"nodeSelectorRequired,omitempty"` - StorageClassName string `json:"storageClassName,omitempty"` - Tolerations []corev1.Toleration `json:"tolerations,omitempty"` + Replicas int32 `json:"replicas"` + Affinity *corev1.Affinity `json:"affinity,omitempty"` + StorageClassName string `json:"storageClassName,omitempty"` + Tolerations []corev1.Toleration `json:"tolerations,omitempty"` } // TiDBSpec contains details of PD member type TiDBSpec struct { ContainerSpec - Replicas int32 `json:"replicas"` - NodeSelector map[string]string `json:"nodeSelector,omitempty"` - NodeSelectorRequired bool `json:"nodeSelectorRequired,omitempty"` - StorageClassName string `json:"storageClassName,omitempty"` - Tolerations []corev1.Toleration `json:"tolerations,omitempty"` - BinlogEnabled bool `json:"binlogEnabled,omitempty"` - MaxFailoverCount int32 `json:"maxFailoverCount,omitempty"` - SeparateSlowLog bool `json:"separateSlowLog,omitempty"` - SlowLogTailer TiDBSlowLogTailerSpec `json:"slowLogTailer,omitempty"` + Replicas int32 `json:"replicas"` + Affinity *corev1.Affinity `json:"affinity,omitempty"` + StorageClassName string `json:"storageClassName,omitempty"` + Tolerations []corev1.Toleration `json:"tolerations,omitempty"` + BinlogEnabled bool `json:"binlogEnabled,omitempty"` + MaxFailoverCount int32 `json:"maxFailoverCount,omitempty"` + SeparateSlowLog bool `json:"separateSlowLog,omitempty"` + SlowLogTailer TiDBSlowLogTailerSpec `json:"slowLogTailer,omitempty"` } // TiDBSlowLogTailerSpec represents an optional log tailer sidecar with TiDB @@ -138,11 +135,10 @@ type TiDBSlowLogTailerSpec struct { // TiKVSpec contains details of PD member type TiKVSpec struct { ContainerSpec - Replicas int32 `json:"replicas"` - NodeSelector map[string]string `json:"nodeSelector,omitempty"` - NodeSelectorRequired bool `json:"nodeSelectorRequired,omitempty"` - StorageClassName string `json:"storageClassName,omitempty"` - Tolerations []corev1.Toleration `json:"tolerations,omitempty"` + Replicas int32 `json:"replicas"` + Affinity *corev1.Affinity `json:"affinity,omitempty"` + StorageClassName string `json:"storageClassName,omitempty"` + Tolerations []corev1.Toleration `json:"tolerations,omitempty"` } // TiKVPromGatewaySpec runs as a sidecar with TiKVSpec diff --git a/pkg/apis/pingcap.com/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/pingcap.com/v1alpha1/zz_generated.deepcopy.go index 9ae44e85c5..4b828246dc 100644 --- a/pkg/apis/pingcap.com/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/pingcap.com/v1alpha1/zz_generated.deepcopy.go @@ -89,12 +89,10 @@ func (in *PDMember) DeepCopy() *PDMember { func (in *PDSpec) DeepCopyInto(out *PDSpec) { *out = *in in.ContainerSpec.DeepCopyInto(&out.ContainerSpec) - if 
in.NodeSelector != nil { - in, out := &in.NodeSelector, &out.NodeSelector - *out = make(map[string]string, len(*in)) - for key, val := range *in { - (*out)[key] = val - } + if in.Affinity != nil { + in, out := &in.Affinity, &out.Affinity + *out = new(v1.Affinity) + (*in).DeepCopyInto(*out) } if in.Tolerations != nil { in, out := &in.Tolerations, &out.Tolerations @@ -238,12 +236,10 @@ func (in *TiDBSlowLogTailerSpec) DeepCopy() *TiDBSlowLogTailerSpec { func (in *TiDBSpec) DeepCopyInto(out *TiDBSpec) { *out = *in in.ContainerSpec.DeepCopyInto(&out.ContainerSpec) - if in.NodeSelector != nil { - in, out := &in.NodeSelector, &out.NodeSelector - *out = make(map[string]string, len(*in)) - for key, val := range *in { - (*out)[key] = val - } + if in.Affinity != nil { + in, out := &in.Affinity, &out.Affinity + *out = new(v1.Affinity) + (*in).DeepCopyInto(*out) } if in.Tolerations != nil { in, out := &in.Tolerations, &out.Tolerations @@ -338,12 +334,10 @@ func (in *TiKVPromGatewaySpec) DeepCopy() *TiKVPromGatewaySpec { func (in *TiKVSpec) DeepCopyInto(out *TiKVSpec) { *out = *in in.ContainerSpec.DeepCopyInto(&out.ContainerSpec) - if in.NodeSelector != nil { - in, out := &in.NodeSelector, &out.NodeSelector - *out = make(map[string]string, len(*in)) - for key, val := range *in { - (*out)[key] = val - } + if in.Affinity != nil { + in, out := &in.Affinity, &out.Affinity + *out = new(v1.Affinity) + (*in).DeepCopyInto(*out) } if in.Tolerations != nil { in, out := &in.Tolerations, &out.Tolerations diff --git a/pkg/manager/member/pd_member_manager.go b/pkg/manager/member/pd_member_manager.go index 8d5b0797b9..f9d51f70a7 100644 --- a/pkg/manager/member/pd_member_manager.go +++ b/pkg/manager/member/pd_member_manager.go @@ -472,12 +472,7 @@ func (pmm *pdMemberManager) getNewPDSetForTidbCluster(tc *v1alpha1.TidbCluster) }, Spec: corev1.PodSpec{ SchedulerName: tc.Spec.SchedulerName, - Affinity: util.AffinityForNodeSelector( - ns, - tc.Spec.PD.NodeSelectorRequired, - label.New().Instance(instanceName).PD(), - tc.Spec.PD.NodeSelector, - ), + Affinity: tc.Spec.PD.Affinity, Containers: []corev1.Container{ { Name: v1alpha1.PDMemberType.String(), diff --git a/pkg/manager/member/tidb_member_manager.go b/pkg/manager/member/tidb_member_manager.go index 4a4eb51e76..5706b9fde8 100644 --- a/pkg/manager/member/tidb_member_manager.go +++ b/pkg/manager/member/tidb_member_manager.go @@ -343,12 +343,7 @@ func (tmm *tidbMemberManager) getNewTiDBSetForTidbCluster(tc *v1alpha1.TidbClust }, Spec: corev1.PodSpec{ SchedulerName: tc.Spec.SchedulerName, - Affinity: util.AffinityForNodeSelector( - ns, - tc.Spec.TiDB.NodeSelectorRequired, - label.New().Instance(instanceName).TiDB(), - tc.Spec.TiDB.NodeSelector, - ), + Affinity: tc.Spec.TiDB.Affinity, Containers: containers, RestartPolicy: corev1.RestartPolicyAlways, Tolerations: tc.Spec.TiDB.Tolerations, diff --git a/pkg/manager/member/tikv_member_manager.go b/pkg/manager/member/tikv_member_manager.go index 8fe442a2bf..aaab1ea88e 100644 --- a/pkg/manager/member/tikv_member_manager.go +++ b/pkg/manager/member/tikv_member_manager.go @@ -332,12 +332,7 @@ func (tkmm *tikvMemberManager) getNewSetForTidbCluster(tc *v1alpha1.TidbCluster) }, Spec: corev1.PodSpec{ SchedulerName: tc.Spec.SchedulerName, - Affinity: util.AffinityForNodeSelector( - ns, - tc.Spec.TiKV.NodeSelectorRequired, - tikvLabel, - tc.Spec.TiKV.NodeSelector, - ), + Affinity: tc.Spec.TiKV.Affinity, Containers: []corev1.Container{ { Name: v1alpha1.TiKVMemberType.String(), diff --git a/pkg/util/util.go b/pkg/util/util.go index 
acb5a03c0f..ba196f362c 100644 --- a/pkg/util/util.go +++ b/pkg/util/util.go @@ -15,7 +15,6 @@ package util import ( "fmt" - "sort" "strconv" "strings" @@ -23,127 +22,8 @@ import ( "github.com/pingcap/tidb-operator/pkg/apis/pingcap.com/v1alpha1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/kubernetes/pkg/kubelet/apis" ) -var ( - // weight is in range 1-100 - topologySchedulingWeight = map[string]int32{ - "region": 10, - "zone": 20, - "rack": 40, - apis.LabelHostname: 80, - } -) - -// AntiAffinityForPod creates a PodAntiAffinity with antiLabels -func AntiAffinityForPod(namespace string, antiLabels map[string]string) *corev1.PodAntiAffinity { - keys := []string{} - for key := range topologySchedulingWeight { - keys = append(keys, key) - } - sort.Strings(keys) // we must use sorted selector, otherwise affinity may vary causing new statefulset generated and pod recreated - terms := []corev1.WeightedPodAffinityTerm{} - for _, key := range keys { - term := corev1.WeightedPodAffinityTerm{ - Weight: topologySchedulingWeight[key], - PodAffinityTerm: corev1.PodAffinityTerm{ - LabelSelector: &metav1.LabelSelector{MatchLabels: antiLabels}, - TopologyKey: key, - Namespaces: []string{namespace}, - }, - } - terms = append(terms, term) - } - return &corev1.PodAntiAffinity{ - PreferredDuringSchedulingIgnoredDuringExecution: terms, - } -} - -// AffinityForNodeSelector creates an Affinity for NodeSelector -// Externally we use NodeSelector for simplicity, -// while internally we convert it to affinity which can express complex scheduling rules -func AffinityForNodeSelector(namespace string, required bool, antiLabels, selector map[string]string) *corev1.Affinity { - if selector == nil { - return nil - } - affinity := &corev1.Affinity{} - if antiLabels != nil { - affinity.PodAntiAffinity = AntiAffinityForPod(namespace, antiLabels) - } - - keys := []string{} - for key := range selector { - keys = append(keys, key) - } - sort.Strings(keys) // we must use sorted selector, otherwise affinity may vary causing new statefulset generated and pod recreated - - requiredTerms := []corev1.NodeSelectorTerm{} - if required { // all nodeSelectors are required - var exps []corev1.NodeSelectorRequirement - for _, key := range keys { - requirement := corev1.NodeSelectorRequirement{ - Key: key, - Operator: corev1.NodeSelectorOpIn, - Values: strings.Split(selector[key], ","), - } - // NodeSelectorRequirement in the same MatchExpressions are ANDed otherwise ORed - exps = append(exps, requirement) - } - requiredTerms = append(requiredTerms, corev1.NodeSelectorTerm{MatchExpressions: exps}) - affinity.NodeAffinity = &corev1.NodeAffinity{ - RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ - NodeSelectorTerms: requiredTerms, - }, - } - return affinity - } - - preferredTerms := []corev1.PreferredSchedulingTerm{} - exps := []corev1.NodeSelectorRequirement{} - for _, key := range keys { - if selector[key] == "" { - continue - } - values := strings.Split(selector[key], ",") - // region,zone,rack,host are preferred labels, others are must match labels - if weight, ok := topologySchedulingWeight[key]; ok { - t := corev1.PreferredSchedulingTerm{ - Weight: weight, - Preference: corev1.NodeSelectorTerm{ - MatchExpressions: []corev1.NodeSelectorRequirement{ - { - Key: key, - Operator: corev1.NodeSelectorOpIn, - Values: values, - }, - }, - }, - } - preferredTerms = append(preferredTerms, t) - } else { - requirement := 
corev1.NodeSelectorRequirement{ - Key: key, - Operator: corev1.NodeSelectorOpIn, - Values: values, - } - // NodeSelectorRequirement in the same MatchExpressions are ANDed otherwise ORed - exps = append(exps, requirement) - } - } - requiredTerms = append(requiredTerms, corev1.NodeSelectorTerm{MatchExpressions: exps}) - - affinity.NodeAffinity = &corev1.NodeAffinity{ - RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ - NodeSelectorTerms: requiredTerms, - }, - PreferredDuringSchedulingIgnoredDuringExecution: preferredTerms, - } - - return affinity -} - // ResourceRequirement creates ResourceRequirements for MemberSpec // Optionally pass in a default value func ResourceRequirement(spec v1alpha1.ContainerSpec, defaultRequests ...corev1.ResourceRequirements) corev1.ResourceRequirements { From 2a3acc3cd6ceec1ed48b73cb6f882c66a8c245cb Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Fri, 10 May 2019 20:18:39 +0800 Subject: [PATCH 02/19] change user guide --- .../templates/config/_pd-config.tpl | 2 +- docs/operation-guide.md | 61 ++++++++++++++++--- 2 files changed, 55 insertions(+), 8 deletions(-) diff --git a/charts/tidb-cluster/templates/config/_pd-config.tpl b/charts/tidb-cluster/templates/config/_pd-config.tpl index daea702054..ea5d044f5a 100644 --- a/charts/tidb-cluster/templates/config/_pd-config.tpl +++ b/charts/tidb-cluster/templates/config/_pd-config.tpl @@ -82,7 +82,7 @@ max-replicas = {{ .Values.pd.maxReplicas }} # The placement priorities is implied by the order of label keys. # For example, ["zone", "rack"] means that we should place replicas to # different zones first, then to different racks if we don't have enough zones. -location-labels = ["zone", "rack", "host"] +location-labels = ["region", "zone", "rack", "host"] [label-property] # Do not assign region leaders to stores that have these tags. diff --git a/docs/operation-guide.md b/docs/operation-guide.md index 8ab0fc53f2..a30bd20ab9 100644 --- a/docs/operation-guide.md +++ b/docs/operation-guide.md @@ -11,21 +11,68 @@ $ namespace="tidb" > **Note:** The rest of the document will use `values.yaml` to reference `charts/tidb-cluster/values.yaml` +## Configuration + +TiDB Operator use `values.yaml` as TiDB cluster configuration file. It provides the default basic configuration which you can use directly for quick deployment, but if you have specific configuration requirements or for production deployment, you need to manually modify the variables in the `value.yaml` file. + +* Resource setting + + * CPU & Memory + + The default deployment doesn't set CPU and memory requests or limits for any of the pods, these settings can make TiDB cluster run on a small Kubernetes cluster like DinD or the default GKE cluster for testing. But for production deployment, you would likely to adjust the cpu, memory and storage resources according to the [recommendations](https://github.com/pingcap/docs/blob/master/op-guide/recommendation.md). + The resource limits should be equal or bigger than the resource requests, it is suggested to set limit and request equal to get [`Guaranteed` QoS]( https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod/#create-a-pod-that-gets-assigned-a-qos-class-of-guaranteed). + + * Storage + + The variables `pd.storageClassName` and `tikv.storageClassName` in `values.yaml` are used to set `StorageClass` of pd and tikv,their default setting are `local-storage` with minimal size. 
+    If you don't want to use the default `StorageClass`, or if your Kubernetes cluster does not support the `local-storage` class, execute the following command to find an available `StorageClass` and select the one you want to provide to the TiDB cluster.
+
+    ```shell
+    $ kubectl get sc
+    ```
+
+* HA setting
+
+  TiDB is a distributed database. Its high availability means that when any physical node fails, not only must the TiDB service remain available, but the data must also remain complete and available.
+
+  How is the high availability of a TiDB cluster guaranteed on Kubernetes?
+
+  We mainly solve the problem through the scheduling of services and data.
+
+  * HA guarantee of TiDB services
+
+    TiDB Operator provides an external scheduler to guarantee HA of the PD/TiKV/TiDB pods at the host level. The TiDB cluster uses this external scheduler as its default scheduler; you can find the setting in the `schedulerName` variable of `values.yaml`.
+
+    On the other hand, the `PodAntiAffinity` term of `affinity` is used to ensure HA at other topology levels (e.g. rack, zone, region).
+    Refer to the doc: [pod affinity & anti-affinity](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#inter-pod-affinity-and-anti-affinity-beta-feature). Moreover, `values.yaml` provides a typical HA setting example in the comments of `pd.affinity`.
+
+  * HA guarantee of data
+
+    HA of data is guaranteed by the TiDB cluster itself. The only work the Operator needs to do is to collect topology info from specific labels of the Kubernetes nodes that the TiKV pods run on; PD then schedules data replicas automatically according to this topology info.
+    Because TiDB Operator currently only recognizes some specific labels, the node topology info can only be set with the following labels:
+
+    * `region`: the region where the node is located
+    * `zone`: the zone where the node is located
+    * `rack`: the rack where the node is located
+    * `kubernetes.io/hostname`: the hostname of the node
+
+    You can label the topology info onto the Kubernetes nodes using the following command:
+    ```shell
+    # The labels are optional
+    $ kubectl label node <node-name> region=<region> zone=<zone> rack=<rack> kubernetes.io/hostname=<host-name>
+    ```
+
+For other settings, the variables in `values.yaml` are self-explanatory with comments. You can modify them according to your needs before installing the charts.
+
 ## Deploy TiDB cluster

-After TiDB Operator and Helm are deployed correctly, TiDB cluster can be deployed using following command:
+After TiDB Operator and Helm are deployed correctly and the configuration is completed, the TiDB cluster can be deployed using the following command:

 ```shell
 $ helm install charts/tidb-cluster --name=${releaseName} --namespace=${namespace}
 $ kubectl get po -n ${namespace} -l app.kubernetes.io/instance=${releaseName}
 ```

-The default deployment doesn't set CPU and memory requests or limits for any of the pods, and the storage used is `local-storage` with minimal size. These settings can make TiDB cluster run on a small Kubernetes cluster like DinD or the default GKE cluster for testing. But for production deployment, you would likely to adjust the cpu, memory and storage resources according to the [recommendations](https://github.com/pingcap/docs/blob/master/op-guide/recommendation.md).
-
-The resource limits should be equal or bigger than the resource requests, it is suggested to set limit and request equal to get [`Guaranteed` QoS]( https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod/#create-a-pod-that-gets-assigned-a-qos-class-of-guaranteed).
- -For other settings, the variables in `values.yaml` are self-explanatory with comments. You can modify them according to your need before installing the charts. - ## Access TiDB cluster By default TiDB service is exposed using [`NodePort`](https://kubernetes.io/docs/concepts/services-networking/service/#nodeport). You can modify it to `ClusterIP` which will disable access from outside of the cluster. Or modify it to [`LoadBalancer`](https://kubernetes.io/docs/concepts/services-networking/service/#loadbalancer) if the underlining Kubernetes supports this kind of service. From dc8226b68bd8d19caefe48a98cf8e7196639f49d Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Tue, 14 May 2019 19:25:39 +0800 Subject: [PATCH 03/19] add DR testcase --- go.mod | 1 + tests/actions.go | 4 + tests/cmd/stability/main.go | 14 +++ tests/ha.go | 206 ++++++++++++++++++++++++++++++++++++ 4 files changed, 225 insertions(+) create mode 100644 tests/ha.go diff --git a/go.mod b/go.mod index 6b3ce80983..c857e9329e 100644 --- a/go.mod +++ b/go.mod @@ -82,6 +82,7 @@ require ( github.com/tmc/grpc-websocket-proxy v0.0.0-20171017195756-830351dc03c6 // indirect github.com/ugorji/go v1.1.1 // indirect github.com/unrolled/render v0.0.0-20180807193321-4206df6ff701 // indirect + github.com/urfave/negroni v1.0.0 // indirect github.com/xiang90/probing v0.0.0-20160813154853-07dd2e8dfe18 // indirect go.uber.org/atomic v1.3.2 // indirect go.uber.org/multierr v1.1.0 // indirect diff --git a/tests/actions.go b/tests/actions.go index ed2e1e5f74..feb46aacab 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -160,6 +160,10 @@ type OperatorActions interface { EmitEvent(info *TidbClusterConfig, msg string) BackupRestore(from, to *TidbClusterConfig) error BackupRestoreOrDie(from, to *TidbClusterConfig) + LabelNodes() error + LabelNodesOrDie() + CheckDR(info *TidbClusterConfig) error + CheckDataRegionDR(info *TidbClusterConfig) error } type operatorActions struct { diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index 1f544bc593..2c3c86a199 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -97,6 +97,20 @@ func main() { Args: map[string]string{ "binlog.drainer.workerCount": "1024", "binlog.drainer.txnBatch": "512", + "pd.affinity": ` + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + # this term work when the nodes have the label named region + - weight: 10 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/instance: stability-cluster1 + app.kubernetes.io/component: "pd" + topologyKey: "rack" + namespaces: + - stability-cluster1 + `, }, Monitor: true, BlockWriteConfig: conf.BlockWriter, diff --git a/tests/ha.go b/tests/ha.go new file mode 100644 index 0000000000..d6a51bd3f5 --- /dev/null +++ b/tests/ha.go @@ -0,0 +1,206 @@ +package tests + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "net/http" + "time" + + "github.com/pingcap/kvproto/pkg/metapb" + "github.com/pingcap/kvproto/pkg/pdpb" + "github.com/pingcap/tidb-operator/pkg/label" + "github.com/pingcap/tidb-operator/tests/slack" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" +) + +const ( + RackLabel = "rack" + RackNum = 3 +) + +// RegionInfo records detail region info for api usage. 
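+// Its fields are used to unmarshal the per-region items returned by PD's /pd/api/v1/regions endpoint.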
+type RegionInfo struct { + ID uint64 `json:"id"` + StartKey string `json:"start_key"` + EndKey string `json:"end_key"` + RegionEpoch *metapb.RegionEpoch `json:"epoch,omitempty"` + Peers []*metapb.Peer `json:"peers,omitempty"` + + Leader *metapb.Peer `json:"leader,omitempty"` + DownPeers []*pdpb.PeerStats `json:"down_peers,omitempty"` + PendingPeers []*metapb.Peer `json:"pending_peers,omitempty"` + WrittenBytes uint64 `json:"written_bytes,omitempty"` + ReadBytes uint64 `json:"read_bytes,omitempty"` + ApproximateSize int64 `json:"approximate_size,omitempty"` + ApproximateKeys int64 `json:"approximate_keys,omitempty"` +} + +// RegionsInfo contains some regions with the detailed region info. +type RegionsInfo struct { + Count int `json:"count"` + Regions []*RegionInfo `json:"regions"` +} + +func (oa *operatorActions) LabelNodes() error { + nodes, err := oa.kubeCli.CoreV1().Nodes().List(metav1.ListOptions{}) + if err != nil { + return err + } + + for i, node := range nodes.Items { + index := i % RackNum + node.Labels[RackLabel] = fmt.Sprintf("rack%d", index) + oa.kubeCli.CoreV1().Nodes().Update(&node) + } + return nil +} + +func (oa *operatorActions) LabelNodesOrDie() { + err := oa.LabelNodes() + if err != nil { + slack.NotifyAndPanic(err) + } +} + +func (oa *operatorActions) CheckDR(cluster *TidbClusterConfig) error { + nodeMap := map[string]corev1.Node{} + nodes, err := oa.kubeCli.CoreV1().Nodes().List(metav1.ListOptions{}) + if err != nil { + return err + } + for _, node := range nodes.Items { + nodeMap[node.Name] = node + } + + pds, err := oa.kubeCli.CoreV1().Pods(cluster.Namespace).List( + metav1.ListOptions{LabelSelector: labels.SelectorFromSet( + label.New().Instance(cluster.ClusterName).PD().Labels(), + ).String()}) + if err != nil { + return err + } + err = oa.checkDR(pds.Items, nodeMap) + if err != nil { + return err + } + + tikvs, err := oa.kubeCli.CoreV1().Pods(cluster.Namespace).List( + metav1.ListOptions{LabelSelector: labels.SelectorFromSet( + label.New().Instance(cluster.ClusterName).TiKV().Labels(), + ).String()}) + if err != nil { + return err + } + err = oa.checkDR(tikvs.Items, nodeMap) + if err != nil { + return err + } + + tidbs, err := oa.kubeCli.CoreV1().Pods(cluster.Namespace).List( + metav1.ListOptions{LabelSelector: labels.SelectorFromSet( + label.New().Instance(cluster.ClusterName).TiDB().Labels(), + ).String()}) + if err != nil { + return err + } + return oa.checkDR(tidbs.Items, nodeMap) +} + +func (oa *operatorActions) checkDR(allPods []corev1.Pod, nodeMap map[string]corev1.Node) error { + rackPods := map[string][]corev1.Pod{} + for _, pod := range allPods { + if node, exist := nodeMap[pod.Spec.NodeName]; exist { + pods, exist := rackPods[node.Labels[RackLabel]] + if !exist { + pods = []corev1.Pod{} + } + pods = append(pods, pod) + rackPods[node.Labels[RackLabel]] = pods + } + } + + podNum := len(allPods) + maxPodsOneRack := podNum / RackNum + mod := podNum % RackNum + if mod > 0 { + maxPodsOneRack = maxPodsOneRack + 1 + } + + for rack, pods := range rackPods { + if len(pods) > maxPodsOneRack { + return fmt.Errorf("the rack:[%s] have pods more than %d", rack, maxPodsOneRack) + } + } + + return nil +} + +func (oa *operatorActions) CheckDataRegionDR(cluster *TidbClusterConfig) error { + pdClient := http.Client{ + Timeout: 10 * time.Second, + } + url := fmt.Sprintf("http://%s-pd.%s:2379/pd/api/v1/regions", cluster.ClusterName, cluster.Namespace) + resp, err := pdClient.Get(url) + if err != nil { + return err + } + buf, _ := ioutil.ReadAll(resp.Body) + regions := 
&RegionsInfo{} + err = json.Unmarshal(buf, ®ions) + if err != nil { + return err + } + + rackNodeMap, err := oa.getNodeRackMap() + if err != nil { + return err + } + + for _, region := range regions.Regions { + regionRacks := map[string]uint64{} + for _, peer := range region.Peers { + nodeName, err := oa.getNodeByStoreId(string(peer.StoreId), cluster) + if err != nil { + return err + } + rackName := rackNodeMap[nodeName] + if otherID, exist := regionRacks[rackName]; exist { + return fmt.Errorf("region[%d]'s peer: [%d]and[%d] are in same rack:[%s]", region.ID, otherID, peer.Id, rackName) + } + regionRacks[rackName] = peer.Id + } + } + return nil +} + +func (oa *operatorActions) getNodeByStoreId(storeID string, cluster *TidbClusterConfig) (string, error) { + tc, err := oa.cli.PingcapV1alpha1().TidbClusters(cluster.Namespace).Get(cluster.ClusterName, metav1.GetOptions{}) + if err != nil { + return "", err + } + if store, exist := tc.Status.TiKV.Stores[storeID]; exist { + pod, err := oa.kubeCli.CoreV1().Pods(cluster.Namespace).Get(store.PodName, metav1.GetOptions{}) + if err != nil { + return "", err + } + return pod.Spec.NodeName, nil + } + + return "", fmt.Errorf("the storeID:[%s] is not exist in tidbCluster:[%s] Status", storeID, cluster.FullName()) +} + +func (oa *operatorActions) getNodeRackMap() (map[string]string, error) { + rackNodeMap := map[string]string{} + nodes, err := oa.kubeCli.CoreV1().Nodes().List(metav1.ListOptions{}) + if err != nil { + return rackNodeMap, err + } + for _, node := range nodes.Items { + rackNodeMap[node.Name] = node.Labels[RackLabel] + } + + return rackNodeMap, nil +} From bb9026e499cc1a3334af0a7790e23efcb65b14d0 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Wed, 15 May 2019 14:20:24 +0800 Subject: [PATCH 04/19] run stability without download charts --- go.sum | 1 + tests/actions.go | 6 +++--- tests/cmd/e2e/main.go | 1 - tests/config.go | 11 +++++++---- tests/ha.go | 12 ++++++------ tests/images/stability-test/Dockerfile | 4 ++++ tests/manifests/e2e/e2e.yaml | 1 + 7 files changed, 22 insertions(+), 14 deletions(-) diff --git a/go.sum b/go.sum index 1865a3364c..9b1627bbe9 100644 --- a/go.sum +++ b/go.sum @@ -196,6 +196,7 @@ github.com/ugorji/go v1.1.1 h1:gmervu+jDMvXTbcHQ0pd2wee85nEoE0BsVyEuzkfK8w= github.com/ugorji/go v1.1.1/go.mod h1:hnLbHMwcvSihnDhEfx2/BzKp2xb0Y+ErdfYcrs9tkJQ= github.com/unrolled/render v0.0.0-20180807193321-4206df6ff701 h1:BJ/T25enw0WcbWqV132hGXRQdqCqe9XBzqh4AWVH7Bc= github.com/unrolled/render v0.0.0-20180807193321-4206df6ff701/go.mod h1:tu82oB5W2ykJRVioYsB+IQKcft7ryBr7w12qMBUPyXg= +github.com/urfave/negroni v1.0.0/go.mod h1:Meg73S6kFm/4PpbYdq35yYWoCZ9mS/YSx+lKnmiohz4= github.com/xiang90/probing v0.0.0-20160813154853-07dd2e8dfe18 h1:MPPkRncZLN9Kh4MEFmbnK4h3BD7AUmskWv2+EeZJCCs= github.com/xiang90/probing v0.0.0-20160813154853-07dd2e8dfe18/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= go.uber.org/atomic v1.3.2 h1:2Oa65PReHzfn29GpvgsYwloV9AVFHPDk8tYxt2c2tr4= diff --git a/tests/actions.go b/tests/actions.go index feb46aacab..06d8eba7f8 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -162,8 +162,8 @@ type OperatorActions interface { BackupRestoreOrDie(from, to *TidbClusterConfig) LabelNodes() error LabelNodesOrDie() - CheckDR(info *TidbClusterConfig) error - CheckDataRegionDR(info *TidbClusterConfig) error + CheckDT(info *TidbClusterConfig) error + CheckDataRegionDT(info *TidbClusterConfig) error } type operatorActions struct { @@ -321,7 +321,7 @@ func (oi *OperatorConfig) OperatorHelmSetString(m map[string]string) string { 
func (oa *operatorActions) DeployOperator(info *OperatorConfig) error { glog.Infof("deploying tidb-operator %s", info.ReleaseName) - if info.Tag != "e2e" { + if info.Tag != "e2e" && info.Tag != "stability" { if err := oa.cloneOperatorRepo(); err != nil { return err } diff --git a/tests/cmd/e2e/main.go b/tests/cmd/e2e/main.go index f39d5495bb..ef8dd51650 100644 --- a/tests/cmd/e2e/main.go +++ b/tests/cmd/e2e/main.go @@ -31,7 +31,6 @@ func main() { defer logs.FlushLogs() conf := tests.ParseConfigOrDie() - conf.ChartDir = "/charts" cli, kubeCli := client.NewCliOrDie() oa := tests.NewOperatorActions(cli, kubeCli, 5*time.Second, conf, nil) diff --git a/tests/config.go b/tests/config.go index e73b6cf841..bcdbb70779 100644 --- a/tests/config.go +++ b/tests/config.go @@ -69,6 +69,7 @@ func NewConfig() (*Config, error) { flag.StringVar(&cfg.OperatorTag, "operator-tag", "master", "operator tag used to choose charts") flag.StringVar(&cfg.OperatorImage, "operator-image", "pingcap/tidb-operator:latest", "operator image") flag.StringVar(&cfg.OperatorRepoDir, "operator-repo-dir", "/tidb-operator", "local directory to which tidb-operator cloned") + flag.StringVar(&cfg.ChartDir, "chart-dir", "", "chart dir") flag.StringVar(&slack.WebhookUrl, "slack-webhook-url", "", "slack webhook url") flag.Parse() @@ -78,11 +79,13 @@ func NewConfig() (*Config, error) { } cfg.OperatorRepoDir = operatorRepo - chartDir, err := ioutil.TempDir("", "charts") - if err != nil { - return nil, err + if strings.TrimSpace(cfg.ChartDir) == "" { + chartDir, err := ioutil.TempDir("", "charts") + if err != nil { + return nil, err + } + cfg.ChartDir = chartDir } - cfg.ChartDir = chartDir return cfg, nil } diff --git a/tests/ha.go b/tests/ha.go index d6a51bd3f5..ecff80e049 100644 --- a/tests/ha.go +++ b/tests/ha.go @@ -65,7 +65,7 @@ func (oa *operatorActions) LabelNodesOrDie() { } } -func (oa *operatorActions) CheckDR(cluster *TidbClusterConfig) error { +func (oa *operatorActions) CheckDT(cluster *TidbClusterConfig) error { nodeMap := map[string]corev1.Node{} nodes, err := oa.kubeCli.CoreV1().Nodes().List(metav1.ListOptions{}) if err != nil { @@ -82,7 +82,7 @@ func (oa *operatorActions) CheckDR(cluster *TidbClusterConfig) error { if err != nil { return err } - err = oa.checkDR(pds.Items, nodeMap) + err = oa.checkDT(pds.Items, nodeMap) if err != nil { return err } @@ -94,7 +94,7 @@ func (oa *operatorActions) CheckDR(cluster *TidbClusterConfig) error { if err != nil { return err } - err = oa.checkDR(tikvs.Items, nodeMap) + err = oa.checkDT(tikvs.Items, nodeMap) if err != nil { return err } @@ -106,10 +106,10 @@ func (oa *operatorActions) CheckDR(cluster *TidbClusterConfig) error { if err != nil { return err } - return oa.checkDR(tidbs.Items, nodeMap) + return oa.checkDT(tidbs.Items, nodeMap) } -func (oa *operatorActions) checkDR(allPods []corev1.Pod, nodeMap map[string]corev1.Node) error { +func (oa *operatorActions) checkDT(allPods []corev1.Pod, nodeMap map[string]corev1.Node) error { rackPods := map[string][]corev1.Pod{} for _, pod := range allPods { if node, exist := nodeMap[pod.Spec.NodeName]; exist { @@ -138,7 +138,7 @@ func (oa *operatorActions) checkDR(allPods []corev1.Pod, nodeMap map[string]core return nil } -func (oa *operatorActions) CheckDataRegionDR(cluster *TidbClusterConfig) error { +func (oa *operatorActions) CheckDataRegionDT(cluster *TidbClusterConfig) error { pdClient := http.Client{ Timeout: 10 * time.Second, } diff --git a/tests/images/stability-test/Dockerfile b/tests/images/stability-test/Dockerfile index 
93ff613b40..06f6b68000 100644 --- a/tests/images/stability-test/Dockerfile +++ b/tests/images/stability-test/Dockerfile @@ -14,4 +14,8 @@ RUN curl https://storage.googleapis.com/kubernetes-release/release/${KUBECTL_VER rm -rf linux-amd64 && \ rm helm-${HELM_VERSION}-linux-amd64.tar.gz +ADD tidb-operator /charts/stability/tidb-operator +ADD tidb-cluster /charts/stability/tidb-cluster +ADD tidb-backup /charts/stability/tidb-backup + ADD bin/stability-test /usr/local/bin/stability-test diff --git a/tests/manifests/e2e/e2e.yaml b/tests/manifests/e2e/e2e.yaml index 9e20014ee9..a5dfeee471 100644 --- a/tests/manifests/e2e/e2e.yaml +++ b/tests/manifests/e2e/e2e.yaml @@ -49,6 +49,7 @@ spec: - --operator-tag=e2e - --operator-image=pingcap/tidb-operator:latest - --tidb-versions=v2.1.3,v2.1.4 + - --chart-dir=/charts volumeMounts: - mountPath: /logDir name: logdir From df111431fd24b8a520bd8fb0c7f1f18eba80a263 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Wed, 15 May 2019 16:06:43 +0800 Subject: [PATCH 05/19] change makefile --- .gitignore | 3 +++ Makefile | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/.gitignore b/.gitignore index bab2459fdf..9710b27ee7 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,9 @@ tests/images/fault-trigger/bin/ tests/images/e2e/tidb-cluster/ tests/images/e2e/tidb-backup/ tests/images/e2e/tidb-operator/ +tests/images/stability-test/tidb-cluster/ +tests/images/stability-test/tidb-backup/ +tests/images/stability-test/tidb-operator/ *.tar tmp/ data/ diff --git a/Makefile b/Makefile index ff399bb8db..948d60e2ea 100644 --- a/Makefile +++ b/Makefile @@ -63,6 +63,12 @@ stability-test-build: $(GO) -ldflags '$(LDFLAGS)' -o tests/images/stability-test/bin/stability-test tests/cmd/stability/*.go stability-test-docker: stability-test-build + [ -d tests/images/stability-test/tidb-operator ] && rm -r tests/images/stability-test/tidb-operator || true + [ -d tests/images/stability-test/tidb-cluster ] && rm -r tests/images/stability-test/tidb-cluster || true + [ -d tests/images/stability-test/tidb-backup ] && rm -r tests/images/stability-test/tidb-backup || true + cp -r charts/tidb-operator tests/images/stability-test + cp -r charts/tidb-cluster tests/images/stability-test + cp -r charts/tidb-backup tests/images/stability-test docker build -t "${DOCKER_REGISTRY}/pingcap/tidb-operator-stability-test:latest" tests/images/stability-test stability-test-push: stability-test-docker From 341186ea7c7c95fdbdad805318fe6b967e7dc465 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Thu, 16 May 2019 14:10:09 +0800 Subject: [PATCH 06/19] fix some bugs --- tests/actions.go | 22 ++++++-- tests/cmd/stability/main.go | 104 +++++++++++++++++++++++++++++++----- tests/{ha.go => dt.go} | 31 ++++++++--- 3 files changed, 132 insertions(+), 25 deletions(-) rename tests/{ha.go => dt.go} (84%) diff --git a/tests/actions.go b/tests/actions.go index 06d8eba7f8..68fd5a9a0f 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -162,8 +162,10 @@ type OperatorActions interface { BackupRestoreOrDie(from, to *TidbClusterConfig) LabelNodes() error LabelNodesOrDie() - CheckDT(info *TidbClusterConfig) error - CheckDataRegionDT(info *TidbClusterConfig) error + CheckDisasterTolerance(info *TidbClusterConfig) error + CheckDisasterToleranceOrDie(info *TidbClusterConfig) + CheckDataRegionDisasterTolerance(info *TidbClusterConfig) error + CheckDataRegionDisasterToleranceOrDie(info *TidbClusterConfig) } type operatorActions struct { @@ -226,6 +228,7 @@ type TidbClusterConfig struct { BlockWriteConfig blockwriter.Config 
GrafanaClient *metrics.Client + SubValues string } func (oi *OperatorConfig) ConfigTLS() *tls.Config { @@ -411,7 +414,20 @@ func (oa *operatorActions) DeployTidbCluster(info *TidbClusterConfig) error { cmd := fmt.Sprintf("helm install %s --name %s --namespace %s --set-string %s", oa.tidbClusterChartPath(info.OperatorTag), info.ClusterName, info.Namespace, info.TidbClusterHelmSetString(nil)) - glog.Infof(cmd) + if strings.TrimSpace(info.SubValues) != "" { + subVaulesPath := fmt.Sprintf("%s/%s.yaml", oa.tidbClusterChartPath(info.OperatorTag), info.ClusterName) + svFile, err := os.Create(subVaulesPath) + if err != nil { + return err + } + defer svFile.Close() + _, err = svFile.WriteString(info.SubValues) + if err != nil { + return err + } + + cmd = fmt.Sprintf(" %s --values %s", cmd, subVaulesPath) + } if res, err := exec.Command("/bin/sh", "-c", cmd).CombinedOutput(); err != nil { return fmt.Errorf("failed to deploy tidbcluster: %s/%s, %v, %s", info.Namespace, info.ClusterName, err, string(res)) diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index 2c3c86a199..c6f4d1ec82 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -97,23 +97,49 @@ func main() { Args: map[string]string{ "binlog.drainer.workerCount": "1024", "binlog.drainer.txnBatch": "512", - "pd.affinity": ` - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - # this term work when the nodes have the label named region - - weight: 10 - podAffinityTerm: - labelSelector: - matchLabels: - app.kubernetes.io/instance: stability-cluster1 - app.kubernetes.io/component: "pd" - topologyKey: "rack" - namespaces: - - stability-cluster1 - `, }, Monitor: true, BlockWriteConfig: conf.BlockWriter, + SubValues: `pd: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 10 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/instance: stability-cluster1 + app.kubernetes.io/component: "pd" + topologyKey: "rack" + namespaces: + - stability-cluster1 +tikv: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 10 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/instance: stability-cluster1 + app.kubernetes.io/component: "tikv" + topologyKey: "rack" + namespaces: + - stability-cluster1 +tidb: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 10 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/instance: stability-cluster1 + app.kubernetes.io/component: "tidb" + topologyKey: "rack" + namespaces: + - stability-cluster1 +`, } cluster2 := &tests.TidbClusterConfig{ Namespace: clusterName2, @@ -149,6 +175,46 @@ func main() { Args: map[string]string{}, Monitor: true, BlockWriteConfig: conf.BlockWriter, + SubValues: `pd: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 10 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/instance: stability-cluster2 + app.kubernetes.io/component: "pd" + topologyKey: "rack" + namespaces: + - stability-cluster2 +tikv: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 10 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/instance: stability-cluster2 + app.kubernetes.io/component: "tikv" + topologyKey: "rack" + namespaces: + - stability-cluster2 +tidb: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 10 + 
podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/instance: stability-cluster2 + app.kubernetes.io/component: "tidb" + topologyKey: "rack" + namespaces: + - stability-cluster2 +`, } // cluster backup and restore @@ -169,6 +235,8 @@ func main() { oa.DumpAllLogs(operatorCfg, allClusters) }() + oa.LabelNodesOrDie() + // clean and deploy operator oa.CleanOperatorOrDie(operatorCfg) oa.DeployOperatorOrDie(operatorCfg) @@ -184,6 +252,10 @@ func main() { oa.CheckTidbClusterStatusOrDie(cluster1) oa.CheckTidbClusterStatusOrDie(cluster2) + // check disaster tolerance + oa.CheckDisasterToleranceOrDie(cluster1) + oa.CheckDisasterToleranceOrDie(cluster2) + go oa.BeginInsertDataToOrDie(cluster1) go oa.BeginInsertDataToOrDie(cluster2) @@ -218,6 +290,10 @@ func main() { // after upgrade cluster, clean webhook oa.CleanWebHookAndService(operatorCfg) + // check data regions disaster tolerance + oa.CheckDataRegionDisasterToleranceOrDie(cluster1) + oa.CheckDataRegionDisasterToleranceOrDie(cluster2) + // deploy and check cluster restore oa.DeployTidbClusterOrDie(clusterRestoreTo) oa.CheckTidbClusterStatusOrDie(clusterRestoreTo) diff --git a/tests/ha.go b/tests/dt.go similarity index 84% rename from tests/ha.go rename to tests/dt.go index ecff80e049..b969a0226c 100644 --- a/tests/ha.go +++ b/tests/dt.go @@ -5,6 +5,7 @@ import ( "fmt" "io/ioutil" "net/http" + "strconv" "time" "github.com/pingcap/kvproto/pkg/metapb" @@ -65,7 +66,7 @@ func (oa *operatorActions) LabelNodesOrDie() { } } -func (oa *operatorActions) CheckDT(cluster *TidbClusterConfig) error { +func (oa *operatorActions) CheckDisasterTolerance(cluster *TidbClusterConfig) error { nodeMap := map[string]corev1.Node{} nodes, err := oa.kubeCli.CoreV1().Nodes().List(metav1.ListOptions{}) if err != nil { @@ -82,7 +83,7 @@ func (oa *operatorActions) CheckDT(cluster *TidbClusterConfig) error { if err != nil { return err } - err = oa.checkDT(pds.Items, nodeMap) + err = oa.checkDisasterTolerance(pds.Items, nodeMap) if err != nil { return err } @@ -94,7 +95,7 @@ func (oa *operatorActions) CheckDT(cluster *TidbClusterConfig) error { if err != nil { return err } - err = oa.checkDT(tikvs.Items, nodeMap) + err = oa.checkDisasterTolerance(tikvs.Items, nodeMap) if err != nil { return err } @@ -106,10 +107,10 @@ func (oa *operatorActions) CheckDT(cluster *TidbClusterConfig) error { if err != nil { return err } - return oa.checkDT(tidbs.Items, nodeMap) + return oa.checkDisasterTolerance(tidbs.Items, nodeMap) } -func (oa *operatorActions) checkDT(allPods []corev1.Pod, nodeMap map[string]corev1.Node) error { +func (oa *operatorActions) checkDisasterTolerance(allPods []corev1.Pod, nodeMap map[string]corev1.Node) error { rackPods := map[string][]corev1.Pod{} for _, pod := range allPods { if node, exist := nodeMap[pod.Spec.NodeName]; exist { @@ -134,11 +135,24 @@ func (oa *operatorActions) checkDT(allPods []corev1.Pod, nodeMap map[string]core return fmt.Errorf("the rack:[%s] have pods more than %d", rack, maxPodsOneRack) } } - return nil } -func (oa *operatorActions) CheckDataRegionDT(cluster *TidbClusterConfig) error { +func (oa *operatorActions) CheckDisasterToleranceOrDie(cluster *TidbClusterConfig) { + err := oa.CheckDisasterTolerance(cluster) + if err != nil { + slack.NotifyAndPanic(err) + } +} + +func (oa *operatorActions) CheckDataRegionDisasterToleranceOrDie(cluster *TidbClusterConfig) { + err := oa.CheckDataRegionDisasterTolerance(cluster) + if err != nil { + slack.NotifyAndPanic(err) + } +} + +func (oa *operatorActions) 
CheckDataRegionDisasterTolerance(cluster *TidbClusterConfig) error { pdClient := http.Client{ Timeout: 10 * time.Second, } @@ -162,7 +176,8 @@ func (oa *operatorActions) CheckDataRegionDT(cluster *TidbClusterConfig) error { for _, region := range regions.Regions { regionRacks := map[string]uint64{} for _, peer := range region.Peers { - nodeName, err := oa.getNodeByStoreId(string(peer.StoreId), cluster) + storeID := strconv.FormatUint(peer.StoreId, 10) + nodeName, err := oa.getNodeByStoreId(storeID, cluster) if err != nil { return err } From 972a4686d9860b706da25df865485ee540629cf8 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Thu, 16 May 2019 14:13:18 +0800 Subject: [PATCH 07/19] add comment --- tests/dt.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/dt.go b/tests/dt.go index b969a0226c..a21fc68aaa 100644 --- a/tests/dt.go +++ b/tests/dt.go @@ -1,3 +1,16 @@ +// Copyright 2019 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + package tests import ( From 9519cb9126d1dd078231cf170384c3072bc354cf Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Thu, 16 May 2019 15:23:18 +0800 Subject: [PATCH 08/19] add case to e2e --- tests/cmd/e2e/main.go | 92 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/tests/cmd/e2e/main.go b/tests/cmd/e2e/main.go index ef8dd51650..35475597b1 100644 --- a/tests/cmd/e2e/main.go +++ b/tests/cmd/e2e/main.go @@ -98,6 +98,46 @@ func main() { BatchSize: 1, RawSize: 1, }, + SubValues: `pd: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 10 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/instance: e2e-cluster1 + app.kubernetes.io/component: "pd" + topologyKey: "rack" + namespaces: + - e2e-cluster1 +tikv: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 10 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/instance: e2e-cluster1 + app.kubernetes.io/component: "tikv" + topologyKey: "rack" + namespaces: + - e2e-cluster1 +tidb: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 10 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/instance: e2e-cluster1 + app.kubernetes.io/component: "tidb" + topologyKey: "rack" + namespaces: + - e2e-cluster1 +`, }, { Namespace: name2, @@ -136,6 +176,46 @@ func main() { BatchSize: 1, RawSize: 1, }, + SubValues: `pd: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 10 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/instance: e2e-cluster2 + app.kubernetes.io/component: "pd" + topologyKey: "rack" + namespaces: + - e2e-cluster2 +tikv: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 10 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/instance: e2e-cluster2 + app.kubernetes.io/component: "tikv" + topologyKey: "rack" + namespaces: + - e2e-cluster2 +tidb: + affinity: + podAntiAffinity: + 
preferredDuringSchedulingIgnoredDuringExecution: + - weight: 10 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/instance: e2e-cluster2 + app.kubernetes.io/component: "tidb" + topologyKey: "rack" + namespaces: + - e2e-cluster2 +`, }, } @@ -143,6 +223,8 @@ func main() { oa.DumpAllLogs(operatorInfo, clusterInfos) }() + oa.LabelNodesOrDie() + // deploy operator if err := oa.CleanOperator(operatorInfo); err != nil { oa.DumpAllLogs(operatorInfo, nil) @@ -169,6 +251,11 @@ func main() { } } + // check disaster tolerance + for _, clusterInfo := range clusterInfos { + oa.CheckDisasterToleranceOrDie(clusterInfo) + } + for _, clusterInfo := range clusterInfos { go oa.BeginInsertDataToOrDie(clusterInfo) } @@ -243,6 +330,11 @@ func main() { } } + // check data regions disaster tolerance + for _, clusterInfo := range clusterInfos { + oa.CheckDataRegionDisasterToleranceOrDie(clusterInfo) + } + // backup and restore backupClusterInfo := clusterInfos[0] restoreClusterInfo := &tests.TidbClusterConfig{} From eafe7fd6332319ca40547618af9fb404561c9112 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Thu, 16 May 2019 19:52:47 +0800 Subject: [PATCH 09/19] change operator's imagePullPolicy --- tests/actions.go | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/tests/actions.go b/tests/actions.go index 68fd5a9a0f..5cf46701fb 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -309,6 +309,7 @@ func (oi *OperatorConfig) OperatorHelmSetString(m map[string]string) string { "scheduler.logLevel": "2", "controllerManager.replicas": "2", "scheduler.replicas": "2", + "imagePullPolicy": "Always", } if oi.SchedulerTag != "" { set["scheduler.kubeSchedulerImageTag"] = oi.SchedulerTag @@ -386,6 +387,7 @@ func (oa *operatorActions) UpgradeOperator(info *OperatorConfig) error { --set operatorImage=%s`, info.ReleaseName, oa.operatorChartPath(info.Tag), info.Image) + res, err := exec.Command("/bin/sh", "-c", cmd).CombinedOutput() if err != nil { return fmt.Errorf("failed to upgrade operator to: %s, %v, %s", info.Image, err, string(res)) @@ -641,8 +643,7 @@ func (oa *operatorActions) ScaleTidbCluster(info *TidbClusterConfig) error { oa.EmitEvent(info, fmt.Sprintf("ScaleTidbCluster to pd: %s, tikv: %s, tidb: %s", info.Args["pd.replicas"], info.Args["tikv.replicas"], info.Args["tidb.replicas"])) - cmd := fmt.Sprintf("helm upgrade %s %s --set-string %s", - info.ClusterName, oa.tidbClusterChartPath(info.OperatorTag), info.TidbClusterHelmSetString(nil)) + cmd := oa.getHelmUpgradeClusterCmd(info, nil) glog.Info("[SCALE] " + cmd) res, err := exec.Command("/bin/sh", "-c", cmd).CombinedOutput() if err != nil { @@ -722,8 +723,7 @@ func (oa *operatorActions) UpgradeTidbCluster(info *TidbClusterConfig) error { } oa.EmitEvent(info, "UpgradeTidbCluster") - cmd := fmt.Sprintf("helm upgrade %s %s --set-string %s", - info.ClusterName, oa.tidbClusterChartPath(info.OperatorTag), info.TidbClusterHelmSetString(nil)) + cmd := oa.getHelmUpgradeClusterCmd(info, nil) glog.Info("[UPGRADE] " + cmd) res, err := exec.Command("/bin/sh", "-c", cmd).CombinedOutput() if err != nil { @@ -1743,10 +1743,7 @@ func (oa *operatorActions) DeployScheduledBackup(info *TidbClusterConfig) error "scheduledBackup.secretName": info.BackupSecretName, } - setString := info.TidbClusterHelmSetString(sets) - - cmd := fmt.Sprintf("helm upgrade %s %s --set-string %s", - info.ClusterName, oa.tidbClusterChartPath(info.OperatorTag), setString) + cmd := oa.getHelmUpgradeClusterCmd(info, sets) 
glog.Infof("scheduled-backup delploy [%s]", cmd) res, err := exec.Command("/bin/sh", "-c", cmd).CombinedOutput() @@ -1764,10 +1761,7 @@ func (oa *operatorActions) disableScheduledBackup(info *TidbClusterConfig) error "scheduledBackup.create": "false", } - setString := info.TidbClusterHelmSetString(sets) - - cmd := fmt.Sprintf("helm upgrade %s %s --set-string %s", - info.ClusterName, oa.tidbClusterChartPath(info.OperatorTag), setString) + cmd := oa.getHelmUpgradeClusterCmd(info, sets) res, err := exec.Command("/bin/sh", "-c", cmd).CombinedOutput() if err != nil { @@ -1964,10 +1958,7 @@ func (oa *operatorActions) DeployIncrementalBackup(from *TidbClusterConfig, to * "binlog.drainer.ignoreSchemas": "", } - setString := from.TidbClusterHelmSetString(sets) - - cmd := fmt.Sprintf("helm upgrade %s %s --set-string %s", - from.ClusterName, oa.tidbClusterChartPath(from.OperatorTag), setString) + cmd := oa.getHelmUpgradeClusterCmd(from, sets) glog.Infof(cmd) res, err := exec.Command("/bin/sh", "-c", cmd).CombinedOutput() if err != nil { @@ -2275,3 +2266,14 @@ func (oa *operatorActions) EventWorker() { ce.events = retryEvents } } + +func (oa *operatorActions) getHelmUpgradeClusterCmd(info *TidbClusterConfig, set map[string]string) string { + cmd := fmt.Sprintf("helm upgrade %s %s --set-string %s", + info.ClusterName, oa.tidbClusterChartPath(info.OperatorTag), info.TidbClusterHelmSetString(set)) + if strings.TrimSpace(info.SubValues) != "" { + subVaulesPath := fmt.Sprintf("%s/%s.yaml", oa.tidbClusterChartPath(info.OperatorTag), info.ClusterName) + cmd = fmt.Sprintf(" %s --values %s", cmd, subVaulesPath) + } + + return cmd +} From 3f533b1c00987e5c1a9a4fa936c8fbe41ebf4588 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Wed, 22 May 2019 19:57:41 +0800 Subject: [PATCH 10/19] remove confilct --- .../tidb-cluster/templates/tidb-cluster.yaml | 9 ---- charts/tidb-cluster/values.yaml | 45 ------------------- 2 files changed, 54 deletions(-) diff --git a/charts/tidb-cluster/templates/tidb-cluster.yaml b/charts/tidb-cluster/templates/tidb-cluster.yaml index 0c27282360..a2d94ea355 100644 --- a/charts/tidb-cluster/templates/tidb-cluster.yaml +++ b/charts/tidb-cluster/templates/tidb-cluster.yaml @@ -35,11 +35,8 @@ spec: {{- end }} affinity: {{ toYaml .Values.pd.affinity | indent 6 }} -<<<<<<< HEAD -======= nodeSelector: {{ toYaml .Values.pd.nodeSelector | indent 6 }} ->>>>>>> master {{- if .Values.pd.tolerations }} tolerations: {{ toYaml .Values.pd.tolerations | indent 4 }} @@ -60,11 +57,8 @@ spec: {{- end }} affinity: {{ toYaml .Values.tikv.affinity | indent 6 }} -<<<<<<< HEAD -======= nodeSelector: {{ toYaml .Values.tikv.nodeSelector | indent 6 }} ->>>>>>> master {{- if .Values.tikv.tolerations }} tolerations: {{ toYaml .Values.tikv.tolerations | indent 4 }} @@ -82,11 +76,8 @@ spec: {{- end }} affinity: {{ toYaml .Values.tidb.affinity | indent 6 }} -<<<<<<< HEAD -======= nodeSelector: {{ toYaml .Values.tidb.nodeSelector | indent 6 }} ->>>>>>> master {{- if .Values.tidb.tolerations }} tolerations: {{ toYaml .Values.tidb.tolerations | indent 4 }} diff --git a/charts/tidb-cluster/values.yaml b/charts/tidb-cluster/values.yaml index 81190b35f4..18fd9c0d2a 100644 --- a/charts/tidb-cluster/values.yaml +++ b/charts/tidb-cluster/values.yaml @@ -79,11 +79,7 @@ pd: ## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity affinity: {} ## The following is typical example of affinity settings: -<<<<<<< HEAD - ## The PodAntiAffinity setting of the example keeps PD pods does not 
co-locate on a topology node as far as possible to improve the high availability of PD on Kubernetes. -======= ## The PodAntiAffinity setting of the example keeps PD pods does not co-locate on a topology node as far as possible to improve the disaster tolerance of PD on Kubernetes. ->>>>>>> master ## The NodeAffinity setting of the example ensure that the PD pods can only be scheduled to nodes with label:[type="pd"], # affinity: # podAntiAffinity: @@ -93,73 +89,41 @@ pd: # podAffinityTerm: # labelSelector: # matchLabels: -<<<<<<< HEAD - # app.kubernetes.io/instance: - # app.kubernetes.io/component: "pd" - # topologyKey: "region" - # namespaces: - # - -======= # app.kubernetes.io/instance: # app.kubernetes.io/component: "pd" # topologyKey: "region" # namespaces: # - ->>>>>>> master # # this term work when the nodes have the label named zone # - weight: 20 # podAffinityTerm: # labelSelector: # matchLabels: -<<<<<<< HEAD - # app.kubernetes.io/instance: - # app.kubernetes.io/component: "pd" - # topologyKey: "zone" - # namespaces: - # - -======= # app.kubernetes.io/instance: # app.kubernetes.io/component: "pd" # topologyKey: "zone" # namespaces: # - ->>>>>>> master # # this term work when the nodes have the label named rack # - weight: 40 # podAffinityTerm: # labelSelector: # matchLabels: -<<<<<<< HEAD - # app.kubernetes.io/instance: - # app.kubernetes.io/component: "pd" - # topologyKey: "rack" - # namespaces: - # - -======= # app.kubernetes.io/instance: # app.kubernetes.io/component: "pd" # topologyKey: "rack" # namespaces: # - ->>>>>>> master # # this term work when the nodes have the label named kubernetes.io/hostname # - weight: 80 # podAffinityTerm: # labelSelector: # matchLabels: -<<<<<<< HEAD - # app.kubernetes.io/instance: - # app.kubernetes.io/component: "pd" - # topologyKey: "kubernetes.io/hostname" - # namespaces: - # - -======= # app.kubernetes.io/instance: # app.kubernetes.io/component: "pd" # topologyKey: "kubernetes.io/hostname" # namespaces: # - ->>>>>>> master # nodeAffinity: # requiredDuringSchedulingIgnoredDuringExecution: # nodeSelectorTerms: @@ -169,13 +133,10 @@ pd: # values: # - "pd" -<<<<<<< HEAD -======= ## nodeSelector ensure pods only assigning to nodes which have each of the indicated key-value pairs as labels ## ref:https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#nodeselector nodeSelector: {} ->>>>>>> master ## Tolerations are applied to pods, and allow pods to schedule onto nodes with matching taints. ## refer to https://kubernetes.io/docs/concepts/configuration/taint-and-toleration tolerations: [] @@ -218,13 +179,10 @@ tikv: ## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity affinity: {} -<<<<<<< HEAD -======= ## nodeSelector ensure pods only assigning to nodes which have each of the indicated key-value pairs as labels ## ref:https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#nodeselector nodeSelector: {} ->>>>>>> master ## Tolerations are applied to pods, and allow pods to schedule onto nodes with matching taints. 
## refer to https://kubernetes.io/docs/concepts/configuration/taint-and-toleration tolerations: [] @@ -309,13 +267,10 @@ tidb: ## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity affinity: {} -<<<<<<< HEAD -======= ## nodeSelector ensure pods only assigning to nodes which have each of the indicated key-value pairs as labels ## ref:https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#nodeselector nodeSelector: {} ->>>>>>> master ## Tolerations are applied to pods, and allow pods to schedule onto nodes with matching taints. ## refer to https://kubernetes.io/docs/concepts/configuration/taint-and-toleration tolerations: [] From b960eb7dc472548f0d74de12818d47c79dd1add0 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Fri, 24 May 2019 13:53:51 +0800 Subject: [PATCH 11/19] address comment --- tests/cmd/e2e/main.go | 82 +----------------------------------- tests/cmd/stability/main.go | 83 ++----------------------------------- tests/dt.go | 30 ++++++++++---- tests/util.go | 49 ++++++++++++++++++++++ 4 files changed, 77 insertions(+), 167 deletions(-) diff --git a/tests/cmd/e2e/main.go b/tests/cmd/e2e/main.go index 33984ba405..39f3865ee4 100644 --- a/tests/cmd/e2e/main.go +++ b/tests/cmd/e2e/main.go @@ -102,46 +102,7 @@ func main() { BatchSize: 1, RawSize: 1, }, - SubValues: `pd: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 10 - podAffinityTerm: - labelSelector: - matchLabels: - app.kubernetes.io/instance: e2e-cluster1 - app.kubernetes.io/component: "pd" - topologyKey: "rack" - namespaces: - - e2e-cluster1 -tikv: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 10 - podAffinityTerm: - labelSelector: - matchLabels: - app.kubernetes.io/instance: e2e-cluster1 - app.kubernetes.io/component: "tikv" - topologyKey: "rack" - namespaces: - - e2e-cluster1 -tidb: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 10 - podAffinityTerm: - labelSelector: - matchLabels: - app.kubernetes.io/instance: e2e-cluster1 - app.kubernetes.io/component: "tidb" - topologyKey: "rack" - namespaces: - - e2e-cluster1 -`, + SubValues: tests.GetAffinityConfigOrDie(name1, name1), EnableConfigMapRollout: true, PDMaxReplicas: 3, TiKVGrpcConcurrency: 4, @@ -185,46 +146,7 @@ tidb: BatchSize: 1, RawSize: 1, }, - SubValues: `pd: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 10 - podAffinityTerm: - labelSelector: - matchLabels: - app.kubernetes.io/instance: e2e-cluster2 - app.kubernetes.io/component: "pd" - topologyKey: "rack" - namespaces: - - e2e-cluster2 -tikv: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 10 - podAffinityTerm: - labelSelector: - matchLabels: - app.kubernetes.io/instance: e2e-cluster2 - app.kubernetes.io/component: "tikv" - topologyKey: "rack" - namespaces: - - e2e-cluster2 -tidb: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 10 - podAffinityTerm: - labelSelector: - matchLabels: - app.kubernetes.io/instance: e2e-cluster2 - app.kubernetes.io/component: "tidb" - topologyKey: "rack" - namespaces: - - e2e-cluster2 -`, + SubValues: tests.GetAffinityConfigOrDie(name2, name2), EnableConfigMapRollout: false, PDMaxReplicas: 3, TiKVGrpcConcurrency: 4, diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index 23f0297b1b..b22b8faff9 100644 --- a/tests/cmd/stability/main.go 
+++ b/tests/cmd/stability/main.go @@ -100,47 +100,9 @@ func main() { }, Monitor: true, BlockWriteConfig: conf.BlockWriter, - SubValues: `pd: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 10 - podAffinityTerm: - labelSelector: - matchLabels: - app.kubernetes.io/instance: stability-cluster1 - app.kubernetes.io/component: "pd" - topologyKey: "rack" - namespaces: - - stability-cluster1 -tikv: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 10 - podAffinityTerm: - labelSelector: - matchLabels: - app.kubernetes.io/instance: stability-cluster1 - app.kubernetes.io/component: "tikv" - topologyKey: "rack" - namespaces: - - stability-cluster1 -tidb: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 10 - podAffinityTerm: - labelSelector: - matchLabels: - app.kubernetes.io/instance: stability-cluster1 - app.kubernetes.io/component: "tidb" - topologyKey: "rack" - namespaces: - - stability-cluster1 -`, } + cluster1.SubValues = tests.GetAffinityConfigOrDie(cluster1.ClusterName, cluster1.Namespace) + cluster2 := &tests.TidbClusterConfig{ Namespace: clusterName2, ClusterName: clusterName2, @@ -175,47 +137,8 @@ tidb: Args: map[string]string{}, Monitor: true, BlockWriteConfig: conf.BlockWriter, - SubValues: `pd: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 10 - podAffinityTerm: - labelSelector: - matchLabels: - app.kubernetes.io/instance: stability-cluster2 - app.kubernetes.io/component: "pd" - topologyKey: "rack" - namespaces: - - stability-cluster2 -tikv: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 10 - podAffinityTerm: - labelSelector: - matchLabels: - app.kubernetes.io/instance: stability-cluster2 - app.kubernetes.io/component: "tikv" - topologyKey: "rack" - namespaces: - - stability-cluster2 -tidb: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 10 - podAffinityTerm: - labelSelector: - matchLabels: - app.kubernetes.io/instance: stability-cluster2 - app.kubernetes.io/component: "tidb" - topologyKey: "rack" - namespaces: - - stability-cluster2 -`, } + cluster2.SubValues = tests.GetAffinityConfigOrDie(cluster2.ClusterName, cluster2.Namespace) // cluster backup and restore clusterBackupFrom := cluster1 diff --git a/tests/dt.go b/tests/dt.go index a21fc68aaa..389d45ec85 100644 --- a/tests/dt.go +++ b/tests/dt.go @@ -21,6 +21,8 @@ import ( "strconv" "time" + "github.com/golang/glog" + "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/kvproto/pkg/pdpb" "github.com/pingcap/tidb-operator/pkg/label" @@ -67,7 +69,11 @@ func (oa *operatorActions) LabelNodes() error { for i, node := range nodes.Items { index := i % RackNum node.Labels[RackLabel] = fmt.Sprintf("rack%d", index) - oa.kubeCli.CoreV1().Nodes().Update(&node) + _, err = oa.kubeCli.CoreV1().Nodes().Update(&node) + if err != nil { + glog.Errorf("label node:[%s] failed!", node.Name) + return err + } } return nil } @@ -96,7 +102,7 @@ func (oa *operatorActions) CheckDisasterTolerance(cluster *TidbClusterConfig) er if err != nil { return err } - err = oa.checkDisasterTolerance(pds.Items, nodeMap) + err = oa.checkPodsDisasterTolerance(pds.Items, nodeMap) if err != nil { return err } @@ -108,7 +114,7 @@ func (oa *operatorActions) CheckDisasterTolerance(cluster *TidbClusterConfig) er if err != nil { return err } - err = oa.checkDisasterTolerance(tikvs.Items, nodeMap) + err = 
oa.checkPodsDisasterTolerance(tikvs.Items, nodeMap) if err != nil { return err } @@ -120,10 +126,10 @@ func (oa *operatorActions) CheckDisasterTolerance(cluster *TidbClusterConfig) er if err != nil { return err } - return oa.checkDisasterTolerance(tidbs.Items, nodeMap) + return oa.checkPodsDisasterTolerance(tidbs.Items, nodeMap) } -func (oa *operatorActions) checkDisasterTolerance(allPods []corev1.Pod, nodeMap map[string]corev1.Node) error { +func (oa *operatorActions) checkPodsDisasterTolerance(allPods []corev1.Pod, nodeMap map[string]corev1.Node) error { rackPods := map[string][]corev1.Pod{} for _, pod := range allPods { if node, exist := nodeMap[pod.Spec.NodeName]; exist { @@ -144,9 +150,13 @@ func (oa *operatorActions) checkDisasterTolerance(allPods []corev1.Pod, nodeMap } for rack, pods := range rackPods { - if len(pods) > maxPodsOneRack { + podNumOnRack := len(pods) + if podNumOnRack > maxPodsOneRack { return fmt.Errorf("the rack:[%s] have pods more than %d", rack, maxPodsOneRack) } + if podNumOnRack < mod { + return fmt.Errorf("the rack:[%s] have pods less than %d", rack, mod) + } } return nil } @@ -185,8 +195,11 @@ func (oa *operatorActions) CheckDataRegionDisasterTolerance(cluster *TidbCluster if err != nil { return err } - + // check peers of region are located on difference racks + // by default region replicas is 3 and rack num is also 3 + // so each rack only have one peer of each data region; if not,return error for _, region := range regions.Regions { + // regionRacks is map of rackName and the peerID regionRacks := map[string]uint64{} for _, peer := range region.Peers { storeID := strconv.FormatUint(peer.StoreId, 10) @@ -195,9 +208,11 @@ func (oa *operatorActions) CheckDataRegionDisasterTolerance(cluster *TidbCluster return err } rackName := rackNodeMap[nodeName] + // if the rack have more than one peer of the region, return error if otherID, exist := regionRacks[rackName]; exist { return fmt.Errorf("region[%d]'s peer: [%d]and[%d] are in same rack:[%s]", region.ID, otherID, peer.Id, rackName) } + // add a new pair of rack and peer regionRacks[rackName] = peer.Id } } @@ -220,6 +235,7 @@ func (oa *operatorActions) getNodeByStoreId(storeID string, cluster *TidbCluster return "", fmt.Errorf("the storeID:[%s] is not exist in tidbCluster:[%s] Status", storeID, cluster.FullName()) } +// getNodeRackMap return the map of node and rack func (oa *operatorActions) getNodeRackMap() (map[string]string, error) { rackNodeMap := map[string]string{} nodes, err := oa.kubeCli.CoreV1().Nodes().List(metav1.ListOptions{}) diff --git a/tests/util.go b/tests/util.go index 34ccab0081..f3c961807b 100644 --- a/tests/util.go +++ b/tests/util.go @@ -14,6 +14,9 @@ package tests import ( + "bytes" + "fmt" + "html/template" "math/rand" "time" @@ -78,3 +81,49 @@ func GetKubeComponent(kubeCli kubernetes.Interface, node string, componentName s } return nil, nil } + +var affinityTemp string = `{{.Kind}}: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: {{.Weight}} + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/instance: {{.ClusterName}} + app.kubernetes.io/component: {{.Kind}} + topologyKey: "rack" + namespaces: + - {{.Namespace}} +` + +func GetAffinityConfigOrDie(clusterName, namespace string) string { + temp, err := template.New("dt-affinity").Parse(affinityTemp) + if err != nil { + slack.NotifyAndPanic(err) + } + + type AffinityInfo struct { + ClusterName string + Kind string + Weight int + Namespace string + } + + pdbuff := 
new(bytes.Buffer) + err = temp.Execute(pdbuff, &AffinityInfo{ClusterName: clusterName, Kind: "pd", Weight: 10, Namespace: namespace}) + if err != nil { + slack.NotifyAndPanic(err) + } + tikvbuff := new(bytes.Buffer) + err = temp.Execute(tikvbuff, &AffinityInfo{ClusterName: clusterName, Kind: "tikv", Weight: 10, Namespace: namespace}) + if err != nil { + slack.NotifyAndPanic(err) + } + tidbbuff := new(bytes.Buffer) + err = temp.Execute(tidbbuff, &AffinityInfo{ClusterName: clusterName, Kind: "tidb", Weight: 10, Namespace: namespace}) + if err != nil { + slack.NotifyAndPanic(err) + } + return fmt.Sprintf("%s%s%s", pdbuff.String(), tikvbuff.String(), tidbbuff.String()) +} From cb8bf3031857999597ecd40605569230fa5e5a0c Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Fri, 24 May 2019 14:08:46 +0800 Subject: [PATCH 12/19] address comment --- tests/util.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/util.go b/tests/util.go index f3c961807b..0d430fdf70 100644 --- a/tests/util.go +++ b/tests/util.go @@ -97,19 +97,19 @@ var affinityTemp string = `{{.Kind}}: - {{.Namespace}} ` +type AffinityInfo struct { + ClusterName string + Kind string + Weight int + Namespace string +} + func GetAffinityConfigOrDie(clusterName, namespace string) string { temp, err := template.New("dt-affinity").Parse(affinityTemp) if err != nil { slack.NotifyAndPanic(err) } - type AffinityInfo struct { - ClusterName string - Kind string - Weight int - Namespace string - } - pdbuff := new(bytes.Buffer) err = temp.Execute(pdbuff, &AffinityInfo{ClusterName: clusterName, Kind: "pd", Weight: 10, Namespace: namespace}) if err != nil { From 438b57d273a2e4305d2b063db02be11ab6c6a16a Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Fri, 24 May 2019 15:16:49 +0800 Subject: [PATCH 13/19] address comment --- tests/actions.go | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/actions.go b/tests/actions.go index b040ddec64..8f4d6593d4 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -334,7 +334,6 @@ func (oi *OperatorConfig) OperatorHelmSetString(m map[string]string) string { "scheduler.logLevel": "2", "controllerManager.replicas": "2", "scheduler.replicas": "2", - "imagePullPolicy": "Always", } if oi.SchedulerTag != "" { set["scheduler.kubeSchedulerImageTag"] = oi.SchedulerTag From 3b943c7d0557453f92e23567a3d3080b05246252 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Fri, 24 May 2019 16:20:41 +0800 Subject: [PATCH 14/19] address comment --- tests/dt.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/dt.go b/tests/dt.go index 389d45ec85..3c60d6662a 100644 --- a/tests/dt.go +++ b/tests/dt.go @@ -143,7 +143,8 @@ func (oa *operatorActions) checkPodsDisasterTolerance(allPods []corev1.Pod, node } podNum := len(allPods) - maxPodsOneRack := podNum / RackNum + minPodsOneRack := podNum / RackNum + maxPodsOneRack := minPodsOneRack mod := podNum % RackNum if mod > 0 { maxPodsOneRack = maxPodsOneRack + 1 @@ -154,7 +155,7 @@ func (oa *operatorActions) checkPodsDisasterTolerance(allPods []corev1.Pod, node if podNumOnRack > maxPodsOneRack { return fmt.Errorf("the rack:[%s] have pods more than %d", rack, maxPodsOneRack) } - if podNumOnRack < mod { + if podNumOnRack < minPodsOneRack { return fmt.Errorf("the rack:[%s] have pods less than %d", rack, mod) } } From 56c23b4ee37476cef3768aa3189ef37494b8e121 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Fri, 24 May 2019 19:39:36 +0800 Subject: [PATCH 15/19] remove charts from stability image --- .gitignore | 3 --- Makefile | 6 
------ tests/actions.go | 2 +- tests/images/stability-test/Dockerfile | 4 ---- tests/manifests/e2e/e2e.yaml | 1 + 5 files changed, 2 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index 7abad251ca..c735f5de6d 100644 --- a/.gitignore +++ b/.gitignore @@ -9,9 +9,6 @@ tests/images/fault-trigger/bin/ tests/images/e2e/tidb-cluster/ tests/images/e2e/tidb-backup/ tests/images/e2e/tidb-operator/ -tests/images/stability-test/tidb-cluster/ -tests/images/stability-test/tidb-backup/ -tests/images/stability-test/tidb-operator/ tests/images/e2e/manifests/ *.tar tmp/ diff --git a/Makefile b/Makefile index 48c8eb8a60..243797757e 100644 --- a/Makefile +++ b/Makefile @@ -67,12 +67,6 @@ stability-test-build: $(GO) -ldflags '$(LDFLAGS)' -o tests/images/stability-test/bin/stability-test tests/cmd/stability/*.go stability-test-docker: stability-test-build - [ -d tests/images/stability-test/tidb-operator ] && rm -r tests/images/stability-test/tidb-operator || true - [ -d tests/images/stability-test/tidb-cluster ] && rm -r tests/images/stability-test/tidb-cluster || true - [ -d tests/images/stability-test/tidb-backup ] && rm -r tests/images/stability-test/tidb-backup || true - cp -r charts/tidb-operator tests/images/stability-test - cp -r charts/tidb-cluster tests/images/stability-test - cp -r charts/tidb-backup tests/images/stability-test docker build -t "${DOCKER_REGISTRY}/pingcap/tidb-operator-stability-test:latest" tests/images/stability-test stability-test-push: stability-test-docker diff --git a/tests/actions.go b/tests/actions.go index 87a33e5cea..cb182d1034 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -362,7 +362,7 @@ func (oi *OperatorConfig) OperatorHelmSetString(m map[string]string) string { func (oa *operatorActions) DeployOperator(info *OperatorConfig) error { glog.Infof("deploying tidb-operator %s", info.ReleaseName) - if info.Tag != "e2e" && info.Tag != "stability" { + if info.Tag != "e2e" { if err := oa.cloneOperatorRepo(); err != nil { return err } diff --git a/tests/images/stability-test/Dockerfile b/tests/images/stability-test/Dockerfile index 06f6b68000..93ff613b40 100644 --- a/tests/images/stability-test/Dockerfile +++ b/tests/images/stability-test/Dockerfile @@ -14,8 +14,4 @@ RUN curl https://storage.googleapis.com/kubernetes-release/release/${KUBECTL_VER rm -rf linux-amd64 && \ rm helm-${HELM_VERSION}-linux-amd64.tar.gz -ADD tidb-operator /charts/stability/tidb-operator -ADD tidb-cluster /charts/stability/tidb-cluster -ADD tidb-backup /charts/stability/tidb-backup - ADD bin/stability-test /usr/local/bin/stability-test diff --git a/tests/manifests/e2e/e2e.yaml b/tests/manifests/e2e/e2e.yaml index a5dfeee471..922934af43 100644 --- a/tests/manifests/e2e/e2e.yaml +++ b/tests/manifests/e2e/e2e.yaml @@ -50,6 +50,7 @@ spec: - --operator-image=pingcap/tidb-operator:latest - --tidb-versions=v2.1.3,v2.1.4 - --chart-dir=/charts + - --manifest-dir=/manifests volumeMounts: - mountPath: /logDir name: logdir From b6cf442cb0749fa18aa18e1481fcbf1259bc61f2 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Tue, 28 May 2019 17:33:55 +0800 Subject: [PATCH 16/19] fix e2e.yaml --- tests/manifests/e2e/e2e.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/manifests/e2e/e2e.yaml b/tests/manifests/e2e/e2e.yaml index 773366a2b1..25b1b91b32 100644 --- a/tests/manifests/e2e/e2e.yaml +++ b/tests/manifests/e2e/e2e.yaml @@ -50,7 +50,6 @@ spec: - --operator-image=pingcap/tidb-operator:latest - --tidb-versions=v3.0.0-beta.1,v3.0.0-rc.1 - --chart-dir=/charts - - 
--manifest-dir=/manifests volumeMounts: - mountPath: /logDir name: logdir From 35d5c781b1922893a7988fbf849f53d87cb447a9 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Tue, 28 May 2019 18:56:02 +0800 Subject: [PATCH 17/19] fix bug --- tests/actions.go | 2 ++ tests/cmd/e2e/main.go | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/actions.go b/tests/actions.go index d9226668e7..9caa271d61 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -504,6 +504,8 @@ func (oa *operatorActions) DeployTidbCluster(info *TidbClusterConfig) error { cmd = fmt.Sprintf(" %s --values %s", cmd, subVaulesPath) } + glog.Info(cmd) + if res, err := exec.Command("/bin/sh", "-c", cmd).CombinedOutput(); err != nil { return fmt.Errorf("failed to deploy tidbcluster: %s/%s, %v, %s", info.Namespace, info.ClusterName, err, string(res)) diff --git a/tests/cmd/e2e/main.go b/tests/cmd/e2e/main.go index fe3043be75..0d36e855b6 100644 --- a/tests/cmd/e2e/main.go +++ b/tests/cmd/e2e/main.go @@ -15,10 +15,11 @@ package main import ( "fmt" - "k8s.io/api/core/v1" _ "net/http/pprof" "time" + "k8s.io/api/core/v1" + "github.com/golang/glog" "github.com/jinzhu/copier" "github.com/pingcap/tidb-operator/tests" @@ -168,6 +169,7 @@ func main() { "pd.replicas": "1", "discovery.image": conf.OperatorImage, }, + SubValues: tests.GetAffinityConfigOrDie(name3, name2), }, } From c8eaf91729f92b5f837bb0b8fec8af406c2af8c5 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Wed, 29 May 2019 01:09:39 +0800 Subject: [PATCH 18/19] remove initSql --- tests/actions.go | 2 -- tests/manifests/e2e/e2e.yaml | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/actions.go b/tests/actions.go index 9caa271d61..48fcebcdbf 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -223,7 +223,6 @@ type TidbClusterConfig struct { TiDBImage string StorageClassName string Password string - InitSQL string RecordCount string InsertBatchSize string Resources map[string]string @@ -297,7 +296,6 @@ func (tc *TidbClusterConfig) TidbClusterHelmSetString(m map[string]string) strin "tikv.image": tc.TiKVImage, "tidb.image": tc.TiDBImage, "tidb.passwordSecretName": tc.InitSecretName, - "tidb.initSql": tc.InitSQL, "monitor.create": strconv.FormatBool(tc.Monitor), "enableConfigMapRollout": strconv.FormatBool(tc.EnableConfigMapRollout), "pd.preStartScript": tc.PDPreStartScript, diff --git a/tests/manifests/e2e/e2e.yaml b/tests/manifests/e2e/e2e.yaml index 25b1b91b32..81f88c979c 100644 --- a/tests/manifests/e2e/e2e.yaml +++ b/tests/manifests/e2e/e2e.yaml @@ -42,7 +42,7 @@ spec: serviceAccount: tidb-operator-e2e containers: - name: tidb-operator-e2e - image: 127.0.0.1:5000/pingcap/tidb-operator-e2e:latest + image: hub.pingcap.net/chenxiaojing/tidb-operator-e2e:latest imagePullPolicy: Always command: - /usr/local/bin/e2e From 1558c31f50b6e51e3f0f574cc7d257ab41cded11 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Wed, 29 May 2019 19:34:30 +0800 Subject: [PATCH 19/19] change image name --- tests/manifests/e2e/e2e.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/manifests/e2e/e2e.yaml b/tests/manifests/e2e/e2e.yaml index 81f88c979c..25b1b91b32 100644 --- a/tests/manifests/e2e/e2e.yaml +++ b/tests/manifests/e2e/e2e.yaml @@ -42,7 +42,7 @@ spec: serviceAccount: tidb-operator-e2e containers: - name: tidb-operator-e2e - image: hub.pingcap.net/chenxiaojing/tidb-operator-e2e:latest + image: 127.0.0.1:5000/pingcap/tidb-operator-e2e:latest imagePullPolicy: Always command: - /usr/local/bin/e2e
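
Editor's note: for reference, this is roughly what the `tests.GetAffinityConfigOrDie(clusterName, namespace)` helper introduced in PATCH 11 renders for the `pd` component, assuming a hypothetical cluster and namespace both named `demo-cluster`; the `tikv` and `tidb` sections it appends are identical apart from the component name. The concatenated string becomes the cluster's `SubValues` and is wired into `helm upgrade` as an extra `--values` file by `getHelmUpgradeClusterCmd`.

pd:
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
      # prefer to spread demo-cluster pd pods across nodes with different "rack" labels
      - weight: 10
        podAffinityTerm:
          labelSelector:
            matchLabels:
              app.kubernetes.io/instance: demo-cluster
              app.kubernetes.io/component: pd
          topologyKey: "rack"
          namespaces:
          - demo-cluster

Users of the chart outside the test suite would place the same structure under pd.affinity, tikv.affinity, and tidb.affinity in values.yaml.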