From f575537a21e345afeb478b0aa4932de680a16a06 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Wed, 3 Apr 2019 13:58:55 +0800 Subject: [PATCH 01/16] add k8s/operator/tidbcluster status checker --- tests/actions.go | 196 +++++++++++++++++++++++++++++++++++++++++++++++ tests/util.go | 15 ++++ 2 files changed, 211 insertions(+) diff --git a/tests/actions.go b/tests/actions.go index e108b52afe6..803311ebe4d 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -54,6 +54,13 @@ const ( defaultRawSize = 100 period = 5 * time.Minute + + tidbControllerName string = "tidb-controller-manager" + tidbSchedulerName string = "tidb-scheduler" + + // NodeUnreachablePodReason is defined in k8s.io/kubernetes/pkg/util/node + // but not in client-go and apimachinery, so we define it here + NodeUnreachablePodReason = "NodeLost" ) func NewOperatorActions(cli versioned.Interface, kubeCli kubernetes.Interface, cfg *Config) OperatorActions { @@ -103,6 +110,9 @@ type OperatorActions interface { getBackupDir(info *TidbClusterInfo) ([]string, error) PendingFailover(info *TidbClusterInfo, faultPoint *time.Time) (bool, error) CheckFailover(info *TidbClusterInfo, faultNode string) (bool, error) + CheckTidbClustersKeepAvailable(infos []*TidbClusterInfo, period time.Duration) error + CheckK8sKeepAvailable(period time.Duration, excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) error + CheckOperatorKeepAvailable(operatorInfo *OperatorInfo, period time.Duration) error } type operatorActions struct { @@ -2096,3 +2106,189 @@ func (oa *operatorActions) GetNodeMap(info *TidbClusterInfo, component string) ( return nodeMap, nil } + +func (oa *operatorActions) CheckK8sKeepAvailable(period time.Duration, excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) error { + timeline := time.Now().Add(period) + for { + if time.Now().After(timeline) { + break + } + err := wait.Poll(3*time.Second, time.Minute, func() (bool, error) { + nodes, err := oa.kubeCli.CoreV1().Nodes().List(metav1.ListOptions{}) + if err != nil { + glog.Errorf("failed to list nodes,error:%v", err) + return false, nil + } + for _, node := range nodes.Items { + if _, exist := excludeNodes[node.GetName()]; exist { + continue + } + if node.Status.Phase != corev1.NodeRunning { + return false, fmt.Errorf("node: [%s] is not in running", node.GetName()) + } + } + systemPods, err := oa.kubeCli.CoreV1().Pods("kube-system").List(metav1.ListOptions{}) + if err != nil { + glog.Errorf("failed to list kube-system pods,error:%v", err) + return false, nil + } + for _, pod := range systemPods.Items { + if _, exist := excludePods[pod.GetName()]; exist { + continue + } + if GetPodStatus(&pod) != string(corev1.PodRunning) { + return false, fmt.Errorf("pod:[%s/%s] is unavailable", pod.GetName(), pod.GetNamespace()) + } + } + return true, nil + }) + if err != nil { + return err + } + time.Sleep(3 * time.Second) + } + return nil +} + +func (oa *operatorActions) CheckOperatorKeepAvailable(operatorInfo *OperatorInfo, period time.Duration) error { + timeline := time.Now().Add(period) + for { + if time.Now().After(timeline) { + break + } + wait.Poll(3*time.Second, 3*time.Minute, func() (bool, error) { + controllerDeployment, err := oa.kubeCli.AppsV1().Deployments(operatorInfo.Namespace).Get(tidbControllerName, metav1.GetOptions{}) + if err != nil { + glog.Errorf("failed to get deployment:%s failed,error:%v", tidbControllerName, err) + return false, nil + } + if controllerDeployment.Status.AvailableReplicas != *controllerDeployment.Spec.Replicas { + return false, fmt.Errorf("the %s is not available", tidbControllerName) + } + schedulerDeployment, err := oa.kubeCli.AppsV1().Deployments(operatorInfo.Namespace).Get(tidbSchedulerName, metav1.GetOptions{}) + if err != nil { + glog.Errorf("failed to get deployment:%s failed,error:%v", tidbSchedulerName, err) + return false, nil + } + if schedulerDeployment.Status.AvailableReplicas != *schedulerDeployment.Spec.Replicas { + return false, fmt.Errorf("the %s is not available", tidbSchedulerName) + } + return true, nil + }) + time.Sleep(3 * time.Second) + } + return nil +} + +func (oa *operatorActions) CheckTidbClustersKeepAvailable(infos []*TidbClusterInfo, period time.Duration) error { + timeline := time.Now().Add(period) + for { + if time.Now().After(timeline) { + break + } + wait.Poll(3*time.Second, 30*time.Second, func() (bool, error) { + for _, info := range infos { + succ, err := oa.addDataToCluster(info) + if err != nil { + return false, err + } + if !succ { + return false, nil + } + } + return true, nil + }) + + time.Sleep(3 * time.Second) + } + return nil +} + +var testTableName = "testTable" + +func (op *operatorActions) addDataToCluster(info *TidbClusterInfo) (bool, error) { + db, err := sql.Open("mysql", getDSN(info.Namespace, info.ClusterName, "test", info.Password)) + if err != nil { + glog.Infof("cluster:[%s] can't open connection to mysql: %v", info.FullName(), err) + return false, nil + } + defer db.Close() + + _, err = db.Exec(fmt.Sprintf("CREATE TABLE %s (name VARCHAR(64))", testTableName)) + if err != nil && !tableAlreadyExist(err) { + glog.Infof("cluster:[%s] can't create table to mysql: %v", info.FullName(), err) + return false, nil + } + + _, err = db.Exec(fmt.Sprintf("INSERT INTO %s VALUES (?)", testTableName), "testValue") + if err != nil { + glog.Infof("cluster:[%s] can't insert data to mysql: %v", info.FullName(), err) + return false, nil + } + + return true, nil +} + +func GetPodStatus(pod *corev1.Pod) string { + reason := string(pod.Status.Phase) + if pod.Status.Reason != "" { + reason = pod.Status.Reason + } + + initializing := false + for i := range pod.Status.InitContainerStatuses { + container := pod.Status.InitContainerStatuses[i] + switch { + case container.State.Terminated != nil && container.State.Terminated.ExitCode == 0: + continue + case container.State.Terminated != nil: + // initialization is failed + if len(container.State.Terminated.Reason) == 0 { + if container.State.Terminated.Signal != 0 { + reason = fmt.Sprintf("Init:Signal:%d", container.State.Terminated.Signal) + } else { + reason = fmt.Sprintf("Init:ExitCode:%d", container.State.Terminated.ExitCode) + } + } else { + reason = "Init:" + container.State.Terminated.Reason + } + initializing = true + case container.State.Waiting != nil && len(container.State.Waiting.Reason) > 0 && container.State.Waiting.Reason != "PodInitializing": + reason = "Init:" + container.State.Waiting.Reason + initializing = true + default: + reason = fmt.Sprintf("Init:%d/%d", i, len(pod.Spec.InitContainers)) + initializing = true + } + break + } + if !initializing { + for i := len(pod.Status.ContainerStatuses) - 1; i >= 0; i-- { + container := pod.Status.ContainerStatuses[i] + + if container.State.Waiting != nil && container.State.Waiting.Reason != "" { + reason = container.State.Waiting.Reason + } else if container.State.Terminated != nil && container.State.Terminated.Reason != "" { + reason = container.State.Terminated.Reason + } else if container.State.Terminated != nil && container.State.Terminated.Reason == "" { + if container.State.Terminated.Signal != 0 { + reason = fmt.Sprintf("Signal:%d", container.State.Terminated.Signal) + } else { + reason = fmt.Sprintf("ExitCode:%d", container.State.Terminated.ExitCode) + } + } + } + } + + if pod.DeletionTimestamp != nil && pod.Status.Reason == NodeUnreachablePodReason { + reason = "Unknown" + } else if pod.DeletionTimestamp != nil { + reason = "Terminating" + } + + return reason +} + +func tableAlreadyExist(err error) bool { + return strings.Contains(err.Error(), "already exists") +} diff --git a/tests/util.go b/tests/util.go index ba3e11806b7..e67b2b13c17 100644 --- a/tests/util.go +++ b/tests/util.go @@ -39,3 +39,18 @@ func CreateKubeClient() (versioned.Interface, kubernetes.Interface, error) { return operatorCli, kubeCli, nil } + +// Keep will keep the fun running in the period, otherwise the fun return error +func Keep(interval time.Duration, period time.Duration, fun func() error) error { + timeline := time.Now().Add(period) + for { + if time.Now().After(timeline) { + break + } + err := fun() + if err != nil { + return err + } + } + return nil +} From c7511289c4a46c8a543a43688ad4386b2b9e0038 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Wed, 3 Apr 2019 13:58:55 +0800 Subject: [PATCH 02/16] add k8s/operator/tidbcluster status checker --- tests/actions.go | 196 +++++++++++++++++++++++++++++++++++++++++++++++ tests/util.go | 15 ++++ 2 files changed, 211 insertions(+) diff --git a/tests/actions.go b/tests/actions.go index e108b52afe6..803311ebe4d 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -54,6 +54,13 @@ const ( defaultRawSize = 100 period = 5 * time.Minute + + tidbControllerName string = "tidb-controller-manager" + tidbSchedulerName string = "tidb-scheduler" + + // NodeUnreachablePodReason is defined in k8s.io/kubernetes/pkg/util/node + // but not in client-go and apimachinery, so we define it here + NodeUnreachablePodReason = "NodeLost" ) func NewOperatorActions(cli versioned.Interface, kubeCli kubernetes.Interface, cfg *Config) OperatorActions { @@ -103,6 +110,9 @@ type OperatorActions interface { getBackupDir(info *TidbClusterInfo) ([]string, error) PendingFailover(info *TidbClusterInfo, faultPoint *time.Time) (bool, error) CheckFailover(info *TidbClusterInfo, faultNode string) (bool, error) + CheckTidbClustersKeepAvailable(infos []*TidbClusterInfo, period time.Duration) error + CheckK8sKeepAvailable(period time.Duration, excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) error + CheckOperatorKeepAvailable(operatorInfo *OperatorInfo, period time.Duration) error } type operatorActions struct { @@ -2096,3 +2106,189 @@ func (oa *operatorActions) GetNodeMap(info *TidbClusterInfo, component string) ( return nodeMap, nil } + +func (oa *operatorActions) CheckK8sKeepAvailable(period time.Duration, excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) error { + timeline := time.Now().Add(period) + for { + if time.Now().After(timeline) { + break + } + err := wait.Poll(3*time.Second, time.Minute, func() (bool, error) { + nodes, err := oa.kubeCli.CoreV1().Nodes().List(metav1.ListOptions{}) + if err != nil { + glog.Errorf("failed to list nodes,error:%v", err) + return false, nil + } + for _, node := range nodes.Items { + if _, exist := excludeNodes[node.GetName()]; exist { + continue + } + if node.Status.Phase != corev1.NodeRunning { + return false, fmt.Errorf("node: [%s] is not in running", node.GetName()) + } + } + systemPods, err := oa.kubeCli.CoreV1().Pods("kube-system").List(metav1.ListOptions{}) + if err != nil { + glog.Errorf("failed to list kube-system pods,error:%v", err) + return false, nil + } + for _, pod := range systemPods.Items { + if _, exist := excludePods[pod.GetName()]; exist { + continue + } + if GetPodStatus(&pod) != string(corev1.PodRunning) { + return false, fmt.Errorf("pod:[%s/%s] is unavailable", pod.GetName(), pod.GetNamespace()) + } + } + return true, nil + }) + if err != nil { + return err + } + time.Sleep(3 * time.Second) + } + return nil +} + +func (oa *operatorActions) CheckOperatorKeepAvailable(operatorInfo *OperatorInfo, period time.Duration) error { + timeline := time.Now().Add(period) + for { + if time.Now().After(timeline) { + break + } + wait.Poll(3*time.Second, 3*time.Minute, func() (bool, error) { + controllerDeployment, err := oa.kubeCli.AppsV1().Deployments(operatorInfo.Namespace).Get(tidbControllerName, metav1.GetOptions{}) + if err != nil { + glog.Errorf("failed to get deployment:%s failed,error:%v", tidbControllerName, err) + return false, nil + } + if controllerDeployment.Status.AvailableReplicas != *controllerDeployment.Spec.Replicas { + return false, fmt.Errorf("the %s is not available", tidbControllerName) + } + schedulerDeployment, err := oa.kubeCli.AppsV1().Deployments(operatorInfo.Namespace).Get(tidbSchedulerName, metav1.GetOptions{}) + if err != nil { + glog.Errorf("failed to get deployment:%s failed,error:%v", tidbSchedulerName, err) + return false, nil + } + if schedulerDeployment.Status.AvailableReplicas != *schedulerDeployment.Spec.Replicas { + return false, fmt.Errorf("the %s is not available", tidbSchedulerName) + } + return true, nil + }) + time.Sleep(3 * time.Second) + } + return nil +} + +func (oa *operatorActions) CheckTidbClustersKeepAvailable(infos []*TidbClusterInfo, period time.Duration) error { + timeline := time.Now().Add(period) + for { + if time.Now().After(timeline) { + break + } + wait.Poll(3*time.Second, 30*time.Second, func() (bool, error) { + for _, info := range infos { + succ, err := oa.addDataToCluster(info) + if err != nil { + return false, err + } + if !succ { + return false, nil + } + } + return true, nil + }) + + time.Sleep(3 * time.Second) + } + return nil +} + +var testTableName = "testTable" + +func (op *operatorActions) addDataToCluster(info *TidbClusterInfo) (bool, error) { + db, err := sql.Open("mysql", getDSN(info.Namespace, info.ClusterName, "test", info.Password)) + if err != nil { + glog.Infof("cluster:[%s] can't open connection to mysql: %v", info.FullName(), err) + return false, nil + } + defer db.Close() + + _, err = db.Exec(fmt.Sprintf("CREATE TABLE %s (name VARCHAR(64))", testTableName)) + if err != nil && !tableAlreadyExist(err) { + glog.Infof("cluster:[%s] can't create table to mysql: %v", info.FullName(), err) + return false, nil + } + + _, err = db.Exec(fmt.Sprintf("INSERT INTO %s VALUES (?)", testTableName), "testValue") + if err != nil { + glog.Infof("cluster:[%s] can't insert data to mysql: %v", info.FullName(), err) + return false, nil + } + + return true, nil +} + +func GetPodStatus(pod *corev1.Pod) string { + reason := string(pod.Status.Phase) + if pod.Status.Reason != "" { + reason = pod.Status.Reason + } + + initializing := false + for i := range pod.Status.InitContainerStatuses { + container := pod.Status.InitContainerStatuses[i] + switch { + case container.State.Terminated != nil && container.State.Terminated.ExitCode == 0: + continue + case container.State.Terminated != nil: + // initialization is failed + if len(container.State.Terminated.Reason) == 0 { + if container.State.Terminated.Signal != 0 { + reason = fmt.Sprintf("Init:Signal:%d", container.State.Terminated.Signal) + } else { + reason = fmt.Sprintf("Init:ExitCode:%d", container.State.Terminated.ExitCode) + } + } else { + reason = "Init:" + container.State.Terminated.Reason + } + initializing = true + case container.State.Waiting != nil && len(container.State.Waiting.Reason) > 0 && container.State.Waiting.Reason != "PodInitializing": + reason = "Init:" + container.State.Waiting.Reason + initializing = true + default: + reason = fmt.Sprintf("Init:%d/%d", i, len(pod.Spec.InitContainers)) + initializing = true + } + break + } + if !initializing { + for i := len(pod.Status.ContainerStatuses) - 1; i >= 0; i-- { + container := pod.Status.ContainerStatuses[i] + + if container.State.Waiting != nil && container.State.Waiting.Reason != "" { + reason = container.State.Waiting.Reason + } else if container.State.Terminated != nil && container.State.Terminated.Reason != "" { + reason = container.State.Terminated.Reason + } else if container.State.Terminated != nil && container.State.Terminated.Reason == "" { + if container.State.Terminated.Signal != 0 { + reason = fmt.Sprintf("Signal:%d", container.State.Terminated.Signal) + } else { + reason = fmt.Sprintf("ExitCode:%d", container.State.Terminated.ExitCode) + } + } + } + } + + if pod.DeletionTimestamp != nil && pod.Status.Reason == NodeUnreachablePodReason { + reason = "Unknown" + } else if pod.DeletionTimestamp != nil { + reason = "Terminating" + } + + return reason +} + +func tableAlreadyExist(err error) bool { + return strings.Contains(err.Error(), "already exists") +} diff --git a/tests/util.go b/tests/util.go index ba3e11806b7..e67b2b13c17 100644 --- a/tests/util.go +++ b/tests/util.go @@ -39,3 +39,18 @@ func CreateKubeClient() (versioned.Interface, kubernetes.Interface, error) { return operatorCli, kubeCli, nil } + +// Keep will keep the fun running in the period, otherwise the fun return error +func Keep(interval time.Duration, period time.Duration, fun func() error) error { + timeline := time.Now().Add(period) + for { + if time.Now().After(timeline) { + break + } + err := fun() + if err != nil { + return err + } + } + return nil +} From f7fa6b4eff917fba070713c26c96423b6409809b Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Thu, 4 Apr 2019 16:58:10 +0800 Subject: [PATCH 03/16] fix --- tests/cmd/stability/main.go | 92 +++++++++++++++++++++++++++++++++---- tests/util.go | 1 + 2 files changed, 85 insertions(+), 8 deletions(-) diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index 44d530ed3f0..3d0567468cd 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -23,17 +23,20 @@ import ( "time" "github.com/jinzhu/copier" - "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/wait" - "k8s.io/apiserver/pkg/util/logs" - "k8s.io/client-go/kubernetes" + "github.com/pingcap/tidb-operator/tests/backup" "github.com/golang/glog" "github.com/pingcap/tidb-operator/tests" - "github.com/pingcap/tidb-operator/tests/backup" "github.com/pingcap/tidb-operator/tests/pkg/workload" "github.com/pingcap/tidb-operator/tests/pkg/workload/ddl" + + "k8s.io/api/core/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/apiserver/pkg/util/logs" + "k8s.io/client-go/kubernetes" ) func main() { @@ -49,7 +52,7 @@ func main() { // TODO read these args from config beginTidbVersion := "v2.1.0" - toTidbVersion := "v2.1.4" + //toTidbVersion := "v2.1.4" operatorTag := "master" operatorImage := "pingcap/tidb-operator:latest" @@ -65,7 +68,7 @@ func main() { ReleaseName: "operator", Image: operatorImage, Tag: operatorTag, - SchedulerImage: "gcr.io/google-containers/hyperkube:v1.12.1", + SchedulerImage: "gcr.io/google-containers/hyperkube", LogLevel: "2", } @@ -275,6 +278,79 @@ func main() { if err := testFailover(kubeCli, oa, fa, conf, clusterInfos); err != nil { glog.Fatal(err) } + + // + faultEtcd := selectNode(conf.ETCDs) + err = fa.StopETCD(faultEtcd) + if err != nil { + glog.Fatal(err) + } + defer fa.StartETCD(faultEtcd) + + err = oa.CheckK8sKeepAvailable(5*time.Minute, nil, nil) + if err != nil { + glog.Fatal(err) + } + err = oa.CheckOperatorKeepAvailable(operatorInfo, 5*time.Minute) + if err != nil { + glog.Fatal(err) + } + err = oa.CheckTidbClustersKeepAvailable(clusterInfos, 5*time.Minute) + if err != nil { + glog.Fatal(err) + } + err = fa.StartETCD(faultEtcd) + if err != nil { + glog.Fatal(err) + } + + faultApiserver := selectNode(conf.APIServers) + apiserverPod, err := getApiserverPod(kubeCli, faultApiserver) + if err != nil { + glog.Fatal(err) + } + err = fa.StopKubeAPIServer(faultApiserver) + if err != nil { + glog.Fatal(err) + } + err = oa.CheckK8sKeepAvailable(5*time.Minute, nil, map[string]*corev1.Pod{apiserverPod.GetName(): apiserverPod}) + if err != nil { + glog.Fatal(err) + } + err = oa.CheckOperatorKeepAvailable(operatorInfo, 5*time.Minute) + if err != nil { + glog.Fatal(err) + } + err = oa.CheckTidbClustersKeepAvailable(clusterInfos, 5*time.Minute) + if err != nil { + glog.Fatal(err) + } + err = fa.StartKubeAPIServer(faultApiserver) + if err != nil { + glog.Fatal(err) + } + +} + +func selectNode(nodes []tests.Nodes) string { + rand.Seed(time.Now().Unix()) + index := rand.Intn(len(nodes)) + return nodes[index].Nodes[0] +} + +func getApiserverPod(kubeCli kubernetes.Interface, node string) (*corev1.Pod, error) { + selector := labels.Set(map[string]string{"component": "kube-apiserver"}).AsSelector() + options := metav1.ListOptions{LabelSelector: selector.String()} + apiserverPods, err := kubeCli.CoreV1().Pods("kube-system").List(options) + if err != nil { + return nil, err + } + for _, apiserverPod := range apiserverPods.Items { + if apiserverPod.Spec.NodeName == node { + return &apiserverPod, nil + } + } + return nil, nil } func testFailover( diff --git a/tests/util.go b/tests/util.go index e67b2b13c17..6e5e56cc268 100644 --- a/tests/util.go +++ b/tests/util.go @@ -51,6 +51,7 @@ func Keep(interval time.Duration, period time.Duration, fun func() error) error if err != nil { return err } + time.Sleep(interval) } return nil } From 1110e2fd7a8938e44e13a333d153d0b2dbc4bb85 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Thu, 4 Apr 2019 18:00:18 +0800 Subject: [PATCH 04/16] fix --- tests/actions.go | 139 +++++++++++++++--------------------- tests/cmd/stability/main.go | 52 ++++++++------ 2 files changed, 88 insertions(+), 103 deletions(-) diff --git a/tests/actions.go b/tests/actions.go index 0fc7ea5acad..f2c3eafb0a2 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -120,9 +120,9 @@ type OperatorActions interface { CheckFailoverOrDie(clusters []*TidbClusterConfig, faultNode string) CheckRecover(cluster *TidbClusterConfig) (bool, error) CheckRecoverOrDie(clusters []*TidbClusterConfig) - CheckK8sKeepAvailable(period time.Duration, excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) error - CheckOperatorKeepAvailable(operatorConfig *OperatorConfig, period time.Duration) error - CheckTidbClustersKeepAvailable(infos []*TidbClusterConfig, period time.Duration) error + CheckK8sAvailable(excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) error + CheckOperatorAvailable(operatorConfig *OperatorConfig) error + CheckTidbClustersAvailable(infos []*TidbClusterConfig) error } type operatorActions struct { @@ -1917,101 +1917,74 @@ func (oa *operatorActions) drainerHealth(info *TidbClusterConfig, hostName strin return len(healths.PumpPos) > 0 && healths.Synced } -func (oa *operatorActions) CheckK8sKeepAvailable(period time.Duration, excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) error { - timeline := time.Now().Add(period) - for { - if time.Now().After(timeline) { - break +func (oa *operatorActions) CheckK8sAvailable(excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) error { + return wait.Poll(3*time.Second, time.Minute, func() (bool, error) { + nodes, err := oa.kubeCli.CoreV1().Nodes().List(metav1.ListOptions{}) + if err != nil { + glog.Errorf("failed to list nodes,error:%v", err) + return false, nil } - err := wait.Poll(3*time.Second, time.Minute, func() (bool, error) { - nodes, err := oa.kubeCli.CoreV1().Nodes().List(metav1.ListOptions{}) - if err != nil { - glog.Errorf("failed to list nodes,error:%v", err) - return false, nil + for _, node := range nodes.Items { + if _, exist := excludeNodes[node.GetName()]; exist { + continue } - for _, node := range nodes.Items { - if _, exist := excludeNodes[node.GetName()]; exist { - continue - } - if node.Status.Phase != corev1.NodeRunning { - return false, fmt.Errorf("node: [%s] is not in running", node.GetName()) - } + if node.Status.Phase != corev1.NodeRunning { + return false, fmt.Errorf("node: [%s] is not in running", node.GetName()) } - systemPods, err := oa.kubeCli.CoreV1().Pods("kube-system").List(metav1.ListOptions{}) - if err != nil { - glog.Errorf("failed to list kube-system pods,error:%v", err) - return false, nil + } + systemPods, err := oa.kubeCli.CoreV1().Pods("kube-system").List(metav1.ListOptions{}) + if err != nil { + glog.Errorf("failed to list kube-system pods,error:%v", err) + return false, nil + } + for _, pod := range systemPods.Items { + if _, exist := excludePods[pod.GetName()]; exist { + continue } - for _, pod := range systemPods.Items { - if _, exist := excludePods[pod.GetName()]; exist { - continue - } - if GetPodStatus(&pod) != string(corev1.PodRunning) { - return false, fmt.Errorf("pod:[%s/%s] is unavailable", pod.GetName(), pod.GetNamespace()) - } + if GetPodStatus(&pod) != string(corev1.PodRunning) { + return false, fmt.Errorf("pod:[%s/%s] is unavailable", pod.GetName(), pod.GetNamespace()) } - return true, nil - }) - if err != nil { - return err } - time.Sleep(3 * time.Second) - } - return nil + return true, nil + }) } -func (oa *operatorActions) CheckOperatorKeepAvailable(operatorConfig *OperatorConfig, period time.Duration) error { - timeline := time.Now().Add(period) - for { - if time.Now().After(timeline) { - break +func (oa *operatorActions) CheckOperatorAvailable(operatorConfig *OperatorConfig) error { + return wait.Poll(3*time.Second, 3*time.Minute, func() (bool, error) { + controllerDeployment, err := oa.kubeCli.AppsV1().Deployments(operatorConfig.Namespace).Get(tidbControllerName, metav1.GetOptions{}) + if err != nil { + glog.Errorf("failed to get deployment:%s failed,error:%v", tidbControllerName, err) + return false, nil + } + if controllerDeployment.Status.AvailableReplicas != *controllerDeployment.Spec.Replicas { + return false, fmt.Errorf("the %s is not available", tidbControllerName) } - wait.Poll(3*time.Second, 3*time.Minute, func() (bool, error) { - controllerDeployment, err := oa.kubeCli.AppsV1().Deployments(operatorConfig.Namespace).Get(tidbControllerName, metav1.GetOptions{}) + schedulerDeployment, err := oa.kubeCli.AppsV1().Deployments(operatorConfig.Namespace).Get(tidbSchedulerName, metav1.GetOptions{}) + if err != nil { + glog.Errorf("failed to get deployment:%s failed,error:%v", tidbSchedulerName, err) + return false, nil + } + if schedulerDeployment.Status.AvailableReplicas != *schedulerDeployment.Spec.Replicas { + return false, fmt.Errorf("the %s is not available", tidbSchedulerName) + } + return true, nil + }) +} + +func (oa *operatorActions) CheckTidbClustersAvailable(infos []*TidbClusterConfig) error { + return wait.Poll(3*time.Second, 30*time.Second, func() (bool, error) { + for _, info := range infos { + succ, err := oa.addDataToCluster(info) if err != nil { - glog.Errorf("failed to get deployment:%s failed,error:%v", tidbControllerName, err) - return false, nil - } - if controllerDeployment.Status.AvailableReplicas != *controllerDeployment.Spec.Replicas { - return false, fmt.Errorf("the %s is not available", tidbControllerName) + return false, err } - schedulerDeployment, err := oa.kubeCli.AppsV1().Deployments(operatorConfig.Namespace).Get(tidbSchedulerName, metav1.GetOptions{}) - if err != nil { - glog.Errorf("failed to get deployment:%s failed,error:%v", tidbSchedulerName, err) + if !succ { return false, nil } - if schedulerDeployment.Status.AvailableReplicas != *schedulerDeployment.Spec.Replicas { - return false, fmt.Errorf("the %s is not available", tidbSchedulerName) - } - return true, nil - }) - time.Sleep(3 * time.Second) - } - return nil -} - -func (oa *operatorActions) CheckTidbClustersKeepAvailable(infos []*TidbClusterConfig, period time.Duration) error { - timeline := time.Now().Add(period) - for { - if time.Now().After(timeline) { - break } - wait.Poll(3*time.Second, 30*time.Second, func() (bool, error) { - for _, info := range infos { - succ, err := oa.addDataToCluster(info) - if err != nil { - return false, err - } - if !succ { - return false, nil - } - } - return true, nil - }) + return true, nil + }) - time.Sleep(3 * time.Second) - } - return nil } var testTableName = "testTable" diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index 2f2bf6f5314..48358806d4b 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -220,23 +220,28 @@ func main() { oa.CheckTidbClusterStatusOrDie(cluster) } - // + // stop one etcd node and k8s/operator/tidbcluster is available faultEtcd := selectNode(conf.ETCDs) err := fta.StopETCD(faultEtcd) if err != nil { glog.Fatal(err) } defer fta.StartETCD(faultEtcd) - - err = oa.CheckK8sKeepAvailable(5*time.Minute, nil, nil) - if err != nil { - glog.Fatal(err) - } - err = oa.CheckOperatorKeepAvailable(operatorCfg, 5*time.Minute) - if err != nil { - glog.Fatal(err) - } - err = oa.CheckTidbClustersKeepAvailable(allClusters, 5*time.Minute) + err = tests.Keep(3*time.Second, 10*time.Minute, func() error { + err := oa.CheckK8sAvailable(nil, nil) + if err != nil { + return err + } + err = oa.CheckOperatorAvailable(operatorCfg) + if err != nil { + return err + } + err = oa.CheckTidbClustersAvailable(allClusters) + if err != nil { + return err + } + return nil + }) if err != nil { glog.Fatal(err) } @@ -245,6 +250,7 @@ func main() { glog.Fatal(err) } + // stop one apiserver node and k8s/operator/tidbcluster is available faultApiserver := selectNode(conf.APIServers) apiserverPod, err := getApiserverPod(kubeCli, faultApiserver) if err != nil { @@ -254,15 +260,21 @@ func main() { if err != nil { glog.Fatal(err) } - err = oa.CheckK8sKeepAvailable(5*time.Minute, nil, map[string]*corev1.Pod{apiserverPod.GetName(): apiserverPod}) - if err != nil { - glog.Fatal(err) - } - err = oa.CheckOperatorKeepAvailable(operatorCfg, 5*time.Minute) - if err != nil { - glog.Fatal(err) - } - err = oa.CheckTidbClustersKeepAvailable(allClusters, 5*time.Minute) + err = tests.Keep(3*time.Second, 10*time.Minute, func() error { + err := oa.CheckK8sAvailable(nil, map[string]*corev1.Pod{apiserverPod.GetName(): apiserverPod}) + if err != nil { + return err + } + err = oa.CheckOperatorAvailable(operatorCfg) + if err != nil { + return err + } + err = oa.CheckTidbClustersAvailable(allClusters) + if err != nil { + return err + } + return nil + }) if err != nil { glog.Fatal(err) } From 78a1395ddaacd5dad988ff422631c7ef8dce666b Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Thu, 4 Apr 2019 19:51:40 +0800 Subject: [PATCH 05/16] fix --- tests/actions.go | 159 ----------------------------------- tests/cmd/stability/main.go | 66 +++------------ tests/failover.go | 160 ++++++++++++++++++++++++++++++++++++ tests/fault.go | 28 +++++++ tests/util.go | 52 ++++++------ 5 files changed, 226 insertions(+), 239 deletions(-) diff --git a/tests/actions.go b/tests/actions.go index f2c3eafb0a2..c3eebe63362 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -1916,162 +1916,3 @@ func (oa *operatorActions) drainerHealth(info *TidbClusterConfig, hostName strin } return len(healths.PumpPos) > 0 && healths.Synced } - -func (oa *operatorActions) CheckK8sAvailable(excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) error { - return wait.Poll(3*time.Second, time.Minute, func() (bool, error) { - nodes, err := oa.kubeCli.CoreV1().Nodes().List(metav1.ListOptions{}) - if err != nil { - glog.Errorf("failed to list nodes,error:%v", err) - return false, nil - } - for _, node := range nodes.Items { - if _, exist := excludeNodes[node.GetName()]; exist { - continue - } - if node.Status.Phase != corev1.NodeRunning { - return false, fmt.Errorf("node: [%s] is not in running", node.GetName()) - } - } - systemPods, err := oa.kubeCli.CoreV1().Pods("kube-system").List(metav1.ListOptions{}) - if err != nil { - glog.Errorf("failed to list kube-system pods,error:%v", err) - return false, nil - } - for _, pod := range systemPods.Items { - if _, exist := excludePods[pod.GetName()]; exist { - continue - } - if GetPodStatus(&pod) != string(corev1.PodRunning) { - return false, fmt.Errorf("pod:[%s/%s] is unavailable", pod.GetName(), pod.GetNamespace()) - } - } - return true, nil - }) -} - -func (oa *operatorActions) CheckOperatorAvailable(operatorConfig *OperatorConfig) error { - return wait.Poll(3*time.Second, 3*time.Minute, func() (bool, error) { - controllerDeployment, err := oa.kubeCli.AppsV1().Deployments(operatorConfig.Namespace).Get(tidbControllerName, metav1.GetOptions{}) - if err != nil { - glog.Errorf("failed to get deployment:%s failed,error:%v", tidbControllerName, err) - return false, nil - } - if controllerDeployment.Status.AvailableReplicas != *controllerDeployment.Spec.Replicas { - return false, fmt.Errorf("the %s is not available", tidbControllerName) - } - schedulerDeployment, err := oa.kubeCli.AppsV1().Deployments(operatorConfig.Namespace).Get(tidbSchedulerName, metav1.GetOptions{}) - if err != nil { - glog.Errorf("failed to get deployment:%s failed,error:%v", tidbSchedulerName, err) - return false, nil - } - if schedulerDeployment.Status.AvailableReplicas != *schedulerDeployment.Spec.Replicas { - return false, fmt.Errorf("the %s is not available", tidbSchedulerName) - } - return true, nil - }) -} - -func (oa *operatorActions) CheckTidbClustersAvailable(infos []*TidbClusterConfig) error { - return wait.Poll(3*time.Second, 30*time.Second, func() (bool, error) { - for _, info := range infos { - succ, err := oa.addDataToCluster(info) - if err != nil { - return false, err - } - if !succ { - return false, nil - } - } - return true, nil - }) - -} - -var testTableName = "testTable" - -func (op *operatorActions) addDataToCluster(info *TidbClusterConfig) (bool, error) { - db, err := sql.Open("mysql", getDSN(info.Namespace, info.ClusterName, "test", info.Password)) - if err != nil { - glog.Infof("cluster:[%s] can't open connection to mysql: %v", info.FullName(), err) - return false, nil - } - defer db.Close() - - _, err = db.Exec(fmt.Sprintf("CREATE TABLE %s (name VARCHAR(64))", testTableName)) - if err != nil && !tableAlreadyExist(err) { - glog.Infof("cluster:[%s] can't create table to mysql: %v", info.FullName(), err) - return false, nil - } - - _, err = db.Exec(fmt.Sprintf("INSERT INTO %s VALUES (?)", testTableName), "testValue") - if err != nil { - glog.Infof("cluster:[%s] can't insert data to mysql: %v", info.FullName(), err) - return false, nil - } - - return true, nil -} - -func GetPodStatus(pod *corev1.Pod) string { - reason := string(pod.Status.Phase) - if pod.Status.Reason != "" { - reason = pod.Status.Reason - } - - initializing := false - for i := range pod.Status.InitContainerStatuses { - container := pod.Status.InitContainerStatuses[i] - switch { - case container.State.Terminated != nil && container.State.Terminated.ExitCode == 0: - continue - case container.State.Terminated != nil: - // initialization is failed - if len(container.State.Terminated.Reason) == 0 { - if container.State.Terminated.Signal != 0 { - reason = fmt.Sprintf("Init:Signal:%d", container.State.Terminated.Signal) - } else { - reason = fmt.Sprintf("Init:ExitCode:%d", container.State.Terminated.ExitCode) - } - } else { - reason = "Init:" + container.State.Terminated.Reason - } - initializing = true - case container.State.Waiting != nil && len(container.State.Waiting.Reason) > 0 && container.State.Waiting.Reason != "PodInitializing": - reason = "Init:" + container.State.Waiting.Reason - initializing = true - default: - reason = fmt.Sprintf("Init:%d/%d", i, len(pod.Spec.InitContainers)) - initializing = true - } - break - } - if !initializing { - for i := len(pod.Status.ContainerStatuses) - 1; i >= 0; i-- { - container := pod.Status.ContainerStatuses[i] - - if container.State.Waiting != nil && container.State.Waiting.Reason != "" { - reason = container.State.Waiting.Reason - } else if container.State.Terminated != nil && container.State.Terminated.Reason != "" { - reason = container.State.Terminated.Reason - } else if container.State.Terminated != nil && container.State.Terminated.Reason == "" { - if container.State.Terminated.Signal != 0 { - reason = fmt.Sprintf("Signal:%d", container.State.Terminated.Signal) - } else { - reason = fmt.Sprintf("ExitCode:%d", container.State.Terminated.ExitCode) - } - } - } - } - - if pod.DeletionTimestamp != nil && pod.Status.Reason == NodeUnreachablePodReason { - reason = "Unknown" - } else if pod.DeletionTimestamp != nil { - reason = "Terminating" - } - - return reason -} - -func tableAlreadyExist(err error) bool { - return strings.Contains(err.Error(), "already exists") -} diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index 48358806d4b..d1de99a6bd5 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -15,7 +15,6 @@ package main import ( "fmt" - "math/rand" "net/http" _ "net/http/pprof" "time" @@ -28,10 +27,7 @@ import ( "github.com/pingcap/tidb-operator/tests/pkg/client" corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/labels" "k8s.io/apiserver/pkg/util/logs" - "k8s.io/client-go/kubernetes" ) func main() { @@ -221,13 +217,10 @@ func main() { } // stop one etcd node and k8s/operator/tidbcluster is available - faultEtcd := selectNode(conf.ETCDs) - err := fta.StopETCD(faultEtcd) - if err != nil { - glog.Fatal(err) - } - defer fta.StartETCD(faultEtcd) - err = tests.Keep(3*time.Second, 10*time.Minute, func() error { + faultEtcd := tests.SelectNode(conf.ETCDs) + fta.StopETCDOrDie(faultEtcd) + defer fta.StartETCDOrDie(faultEtcd) + tests.KeepOrDie(3*time.Second, 10*time.Minute, func() error { err := oa.CheckK8sAvailable(nil, nil) if err != nil { return err @@ -242,25 +235,13 @@ func main() { } return nil }) - if err != nil { - glog.Fatal(err) - } - err = fta.StartETCD(faultEtcd) - if err != nil { - glog.Fatal(err) - } + fta.StartETCDOrDie(faultEtcd) // stop one apiserver node and k8s/operator/tidbcluster is available - faultApiserver := selectNode(conf.APIServers) - apiserverPod, err := getApiserverPod(kubeCli, faultApiserver) - if err != nil { - glog.Fatal(err) - } - err = fta.StopKubeAPIServer(faultApiserver) - if err != nil { - glog.Fatal(err) - } - err = tests.Keep(3*time.Second, 10*time.Minute, func() error { + faultApiserver := tests.SelectNode(conf.APIServers) + apiserverPod := tests.GetApiserverPodOrDie(kubeCli, faultApiserver) + fta.StopKubeAPIServerOrDie(faultApiserver) + tests.KeepOrDie(3*time.Second, 10*time.Minute, func() error { err := oa.CheckK8sAvailable(nil, map[string]*corev1.Pod{apiserverPod.GetName(): apiserverPod}) if err != nil { return err @@ -275,34 +256,7 @@ func main() { } return nil }) - if err != nil { - glog.Fatal(err) - } - err = fta.StartKubeAPIServer(faultApiserver) - if err != nil { - glog.Fatal(err) - } + fta.StartKubeAPIServerOrDie(faultApiserver) glog.Infof("\nFinished.") } - -func selectNode(nodes []tests.Nodes) string { - rand.Seed(time.Now().Unix()) - index := rand.Intn(len(nodes)) - return nodes[index].Nodes[0] -} - -func getApiserverPod(kubeCli kubernetes.Interface, node string) (*corev1.Pod, error) { - selector := labels.Set(map[string]string{"component": "kube-apiserver"}).AsSelector() - options := metav1.ListOptions{LabelSelector: selector.String()} - apiserverPods, err := kubeCli.CoreV1().Pods("kube-system").List(options) - if err != nil { - return nil, err - } - for _, apiserverPod := range apiserverPods.Items { - if apiserverPod.Spec.NodeName == node { - return &apiserverPod, nil - } - } - return nil, nil -} diff --git a/tests/failover.go b/tests/failover.go index ea1631909d4..8ca55c550d6 100644 --- a/tests/failover.go +++ b/tests/failover.go @@ -1,6 +1,7 @@ package tests import ( + "database/sql" "fmt" "sort" "strings" @@ -307,3 +308,162 @@ func (oa *operatorActions) GetNodeMap(info *TidbClusterConfig, component string) return nodeMap, nil } + +func (oa *operatorActions) CheckK8sAvailable(excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) error { + return wait.Poll(3*time.Second, time.Minute, func() (bool, error) { + nodes, err := oa.kubeCli.CoreV1().Nodes().List(metav1.ListOptions{}) + if err != nil { + glog.Errorf("failed to list nodes,error:%v", err) + return false, nil + } + for _, node := range nodes.Items { + if _, exist := excludeNodes[node.GetName()]; exist { + continue + } + if node.Status.Phase != corev1.NodeRunning { + return false, fmt.Errorf("node: [%s] is not in running", node.GetName()) + } + } + systemPods, err := oa.kubeCli.CoreV1().Pods("kube-system").List(metav1.ListOptions{}) + if err != nil { + glog.Errorf("failed to list kube-system pods,error:%v", err) + return false, nil + } + for _, pod := range systemPods.Items { + if _, exist := excludePods[pod.GetName()]; exist { + continue + } + if GetPodStatus(&pod) != string(corev1.PodRunning) { + return false, fmt.Errorf("pod:[%s/%s] is unavailable", pod.GetName(), pod.GetNamespace()) + } + } + return true, nil + }) +} + +func (oa *operatorActions) CheckOperatorAvailable(operatorConfig *OperatorConfig) error { + return wait.Poll(3*time.Second, 3*time.Minute, func() (bool, error) { + controllerDeployment, err := oa.kubeCli.AppsV1().Deployments(operatorConfig.Namespace).Get(tidbControllerName, metav1.GetOptions{}) + if err != nil { + glog.Errorf("failed to get deployment:%s failed,error:%v", tidbControllerName, err) + return false, nil + } + if controllerDeployment.Status.AvailableReplicas != *controllerDeployment.Spec.Replicas { + return false, fmt.Errorf("the %s is not available", tidbControllerName) + } + schedulerDeployment, err := oa.kubeCli.AppsV1().Deployments(operatorConfig.Namespace).Get(tidbSchedulerName, metav1.GetOptions{}) + if err != nil { + glog.Errorf("failed to get deployment:%s failed,error:%v", tidbSchedulerName, err) + return false, nil + } + if schedulerDeployment.Status.AvailableReplicas != *schedulerDeployment.Spec.Replicas { + return false, fmt.Errorf("the %s is not available", tidbSchedulerName) + } + return true, nil + }) +} + +func (oa *operatorActions) CheckTidbClustersAvailable(infos []*TidbClusterConfig) error { + return wait.Poll(3*time.Second, 30*time.Second, func() (bool, error) { + for _, info := range infos { + succ, err := oa.addDataToCluster(info) + if err != nil { + return false, err + } + if !succ { + return false, nil + } + } + return true, nil + }) + +} + +var testTableName = "testTable" + +func (op *operatorActions) addDataToCluster(info *TidbClusterConfig) (bool, error) { + db, err := sql.Open("mysql", getDSN(info.Namespace, info.ClusterName, "test", info.Password)) + if err != nil { + glog.Infof("cluster:[%s] can't open connection to mysql: %v", info.FullName(), err) + return false, nil + } + defer db.Close() + + _, err = db.Exec(fmt.Sprintf("CREATE TABLE %s (name VARCHAR(64))", testTableName)) + if err != nil && !tableAlreadyExist(err) { + glog.Infof("cluster:[%s] can't create table to mysql: %v", info.FullName(), err) + return false, nil + } + + _, err = db.Exec(fmt.Sprintf("INSERT INTO %s VALUES (?)", testTableName), "testValue") + if err != nil { + glog.Infof("cluster:[%s] can't insert data to mysql: %v", info.FullName(), err) + return false, nil + } + + return true, nil +} + +func GetPodStatus(pod *corev1.Pod) string { + reason := string(pod.Status.Phase) + if pod.Status.Reason != "" { + reason = pod.Status.Reason + } + + initializing := false + for i := range pod.Status.InitContainerStatuses { + container := pod.Status.InitContainerStatuses[i] + switch { + case container.State.Terminated != nil && container.State.Terminated.ExitCode == 0: + continue + case container.State.Terminated != nil: + // initialization is failed + if len(container.State.Terminated.Reason) == 0 { + if container.State.Terminated.Signal != 0 { + reason = fmt.Sprintf("Init:Signal:%d", container.State.Terminated.Signal) + } else { + reason = fmt.Sprintf("Init:ExitCode:%d", container.State.Terminated.ExitCode) + } + } else { + reason = "Init:" + container.State.Terminated.Reason + } + initializing = true + case container.State.Waiting != nil && len(container.State.Waiting.Reason) > 0 && container.State.Waiting.Reason != "PodInitializing": + reason = "Init:" + container.State.Waiting.Reason + initializing = true + default: + reason = fmt.Sprintf("Init:%d/%d", i, len(pod.Spec.InitContainers)) + initializing = true + } + break + } + if !initializing { + for i := len(pod.Status.ContainerStatuses) - 1; i >= 0; i-- { + container := pod.Status.ContainerStatuses[i] + + if container.State.Waiting != nil && container.State.Waiting.Reason != "" { + reason = container.State.Waiting.Reason + } else if container.State.Terminated != nil && container.State.Terminated.Reason != "" { + reason = container.State.Terminated.Reason + } else if container.State.Terminated != nil && container.State.Terminated.Reason == "" { + if container.State.Terminated.Signal != 0 { + reason = fmt.Sprintf("Signal:%d", container.State.Terminated.Signal) + } else { + reason = fmt.Sprintf("ExitCode:%d", container.State.Terminated.ExitCode) + } + } + } + } + + if pod.DeletionTimestamp != nil && pod.Status.Reason == NodeUnreachablePodReason { + reason = "Unknown" + } else if pod.DeletionTimestamp != nil { + reason = "Terminating" + } + + return reason +} + +func tableAlreadyExist(err error) bool { + return strings.Contains(err.Error(), "already exists") +} diff --git a/tests/fault.go b/tests/fault.go index 4f8e75bea9b..d24bdae7ff5 100644 --- a/tests/fault.go +++ b/tests/fault.go @@ -28,11 +28,15 @@ type FaultTriggerActions interface { StartNode(physicalNode string, node string) error StartNodeOrDie(physicalNode string, node string) StopETCD(nodes ...string) error + StopETCDOrDie(nodes ...string) StartETCD(nodes ...string) error + StartETCDOrDie(nodes ...string) StopKubelet(node string) error StartKubelet(node string) error StopKubeAPIServer(node string) error + StopKubeAPIServerOrDie(node string) StartKubeAPIServer(node string) error + StartKubeAPIServerOrDie(node string) StopKubeControllerManager(node string) error StartKubeControllerManager(node string) error StopKubeScheduler(node string) error @@ -156,6 +160,12 @@ func (fa *faultTriggerActions) StopETCD(nodes ...string) error { return nil } +func (fa *faultTriggerActions) StopETCDOrDie(nodes ...string) { + if err := fa.StopETCD(nodes...); err != nil { + panic(err) + } +} + // StartETCD starts the etcd service. // If the `nodes` is empty, StartETCD will start all etcd service. func (fa *faultTriggerActions) StartETCD(nodes ...string) error { @@ -174,6 +184,12 @@ func (fa *faultTriggerActions) StartETCD(nodes ...string) error { return nil } +func (fa *faultTriggerActions) StartETCDOrDie(nodes ...string) { + if err := fa.StartETCD(nodes...); err != nil { + panic(err) + } +} + // StopKubelet stops the kubelet service. func (fa *faultTriggerActions) StopKubelet(node string) error { return fa.serviceAction(node, manager.KubeletService, stopAction) @@ -219,11 +235,23 @@ func (fa *faultTriggerActions) StopKubeAPIServer(node string) error { return fa.serviceAction(node, manager.KubeAPIServerService, stopAction) } +func (fa *faultTriggerActions) StopKubeAPIServerOrDie(node string) { + if err := fa.StopKubeAPIServer(node); err != nil { + panic(err) + } +} + // StartKubeAPIServer starts the apiserver service. func (fa *faultTriggerActions) StartKubeAPIServer(node string) error { return fa.serviceAction(node, manager.KubeAPIServerService, startAction) } +func (fa *faultTriggerActions) StartKubeAPIServerOrDie(node string) { + if err := fa.StartKubeAPIServer(node); err != nil { + panic(err) + } +} + func (fa *faultTriggerActions) serviceAction(node string, serverName string, action string) error { faultCli := client.NewClient(client.Config{ Addr: fa.genFaultTriggerAddr(node), diff --git a/tests/util.go b/tests/util.go index 6e5e56cc268..de7c55cdb05 100644 --- a/tests/util.go +++ b/tests/util.go @@ -14,34 +14,18 @@ package tests import ( + "fmt" + "math/rand" "time" - "github.com/pingcap/tidb-operator/pkg/client/clientset/versioned" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" ) -func CreateKubeClient() (versioned.Interface, kubernetes.Interface, error) { - cfg, err := rest.InClusterConfig() - if err != nil { - return nil, nil, err - } - cfg.Timeout = 30 * time.Second - operatorCli, err := versioned.NewForConfig(cfg) - if err != nil { - return nil, nil, err - } - - kubeCli, err := kubernetes.NewForConfig(cfg) - if err != nil { - return nil, nil, err - } - - return operatorCli, kubeCli, nil -} - // Keep will keep the fun running in the period, otherwise the fun return error -func Keep(interval time.Duration, period time.Duration, fun func() error) error { +func KeepOrDie(interval time.Duration, period time.Duration, fun func() error) { timeline := time.Now().Add(period) for { if time.Now().After(timeline) { @@ -49,9 +33,29 @@ func Keep(interval time.Duration, period time.Duration, fun func() error) error } err := fun() if err != nil { - return err + panic(err) } time.Sleep(interval) } - return nil +} + +func SelectNode(nodes []Nodes) string { + rand.Seed(time.Now().Unix()) + index := rand.Intn(len(nodes)) + return nodes[index].Nodes[0] +} + +func GetApiserverPodOrDie(kubeCli kubernetes.Interface, node string) *corev1.Pod { + selector := labels.Set(map[string]string{"component": "kube-apiserver"}).AsSelector() + options := metav1.ListOptions{LabelSelector: selector.String()} + apiserverPods, err := kubeCli.CoreV1().Pods("kube-system").List(options) + if err != nil { + panic(err) + } + for _, apiserverPod := range apiserverPods.Items { + if apiserverPod.Spec.NodeName == node { + return &apiserverPod + } + } + panic(fmt.Errorf("can't find apiserver in node:%s", node)) } From 88caef673c416c6c7a3391d1e54ea282bfad03af Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Thu, 11 Apr 2019 11:51:32 +0800 Subject: [PATCH 06/16] add controller fault --- tests/failover.go | 6 ++++-- tests/util.go | 15 +++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/tests/failover.go b/tests/failover.go index 8ca55c550d6..4af2193b47e 100644 --- a/tests/failover.go +++ b/tests/failover.go @@ -320,8 +320,10 @@ func (oa *operatorActions) CheckK8sAvailable(excludeNodes map[string]*corev1.Nod if _, exist := excludeNodes[node.GetName()]; exist { continue } - if node.Status.Phase != corev1.NodeRunning { - return false, fmt.Errorf("node: [%s] is not in running", node.GetName()) + for _, condition := range node.Status.Conditions { + if condition.Type == corev1.NodeReady && condition.Status != corev1.ConditionTrue { + return false, fmt.Errorf("node: [%s] is not in running", node.GetName()) + } } } systemPods, err := oa.kubeCli.CoreV1().Pods("kube-system").List(metav1.ListOptions{}) diff --git a/tests/util.go b/tests/util.go index de7c55cdb05..5d8888ecb73 100644 --- a/tests/util.go +++ b/tests/util.go @@ -59,3 +59,18 @@ func GetApiserverPodOrDie(kubeCli kubernetes.Interface, node string) *corev1.Pod } panic(fmt.Errorf("can't find apiserver in node:%s", node)) } + +func GetControllerManagerPodOrDie(kubeCli kubernetes.Interface, node string) *corev1.Pod { + selector := labels.Set(map[string]string{"component": "kube-controller-manager"}).AsSelector() + options := metav1.ListOptions{LabelSelector: selector.String()} + apiserverPods, err := kubeCli.CoreV1().Pods("kube-system").List(options) + if err != nil { + panic(err) + } + for _, apiserverPod := range apiserverPods.Items { + if apiserverPod.Spec.NodeName == node { + return &apiserverPod + } + } + panic(fmt.Errorf("can't find controller-manager in node:%s", node)) +} From 23355dcf7790201302abdac64c9da25359369a01 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Fri, 19 Apr 2019 14:17:09 +0800 Subject: [PATCH 07/16] fix some bug --- tests/actions.go | 2 ++ tests/cmd/stability/main.go | 39 ++++----------------- tests/failover.go | 68 +++++++++++++++++++++++++++++++++++-- tests/util.go | 34 +++++++++++++++++++ 4 files changed, 107 insertions(+), 36 deletions(-) diff --git a/tests/actions.go b/tests/actions.go index e7bee5aea0f..ab0ad023d0f 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -121,6 +121,8 @@ type OperatorActions interface { CheckK8sAvailable(excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) error CheckOperatorAvailable(operatorConfig *OperatorConfig) error CheckTidbClustersAvailable(infos []*TidbClusterConfig) error + CheckOneEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) + CheckOneApiserverDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) } type operatorActions struct { diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index 825c07669eb..c779140a81d 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -19,14 +19,14 @@ import ( _ "net/http/pprof" "time" + "github.com/pingcap/tidb-operator/tests/backup" + "github.com/golang/glog" "github.com/jinzhu/copier" "github.com/pingcap/tidb-operator/tests" - "github.com/pingcap/tidb-operator/tests/backup" "github.com/pingcap/tidb-operator/tests/pkg/client" - corev1 "k8s.io/api/core/v1" "k8s.io/apiserver/pkg/util/logs" ) @@ -42,6 +42,7 @@ func main() { oa := tests.NewOperatorActions(cli, kubeCli, conf) fta := tests.NewFaultTriggerAction(cli, kubeCli, conf) fta.CheckAndRecoverEnvOrDie() + oa.CheckK8sAvailable(nil, nil) tidbVersion := conf.GetTiDBVersionOrDie() upgardeTiDBVersions := conf.GetUpgradeTidbVersionsOrDie() @@ -225,42 +226,14 @@ func main() { faultEtcd := tests.SelectNode(conf.ETCDs) fta.StopETCDOrDie(faultEtcd) defer fta.StartETCDOrDie(faultEtcd) - tests.KeepOrDie(3*time.Second, 10*time.Minute, func() error { - err := oa.CheckK8sAvailable(nil, nil) - if err != nil { - return err - } - err = oa.CheckOperatorAvailable(operatorCfg) - if err != nil { - return err - } - err = oa.CheckTidbClustersAvailable(allClusters) - if err != nil { - return err - } - return nil - }) + oa.CheckOneEtcdDownOrDie(operatorCfg, allClusters, faultEtcd) fta.StartETCDOrDie(faultEtcd) // stop one apiserver node and k8s/operator/tidbcluster is available faultApiserver := tests.SelectNode(conf.APIServers) - apiserverPod := tests.GetApiserverPodOrDie(kubeCli, faultApiserver) fta.StopKubeAPIServerOrDie(faultApiserver) - tests.KeepOrDie(3*time.Second, 10*time.Minute, func() error { - err := oa.CheckK8sAvailable(nil, map[string]*corev1.Pod{apiserverPod.GetName(): apiserverPod}) - if err != nil { - return err - } - err = oa.CheckOperatorAvailable(operatorCfg) - if err != nil { - return err - } - err = oa.CheckTidbClustersAvailable(allClusters) - if err != nil { - return err - } - return nil - }) + defer fta.StartKubeAPIServer(faultApiserver) + oa.CheckOneApiserverDownOrDie(operatorCfg, allClusters, faultApiserver) fta.StartKubeAPIServerOrDie(faultApiserver) glog.Infof("\nFinished.") diff --git a/tests/failover.go b/tests/failover.go index 63f91a749da..f72a259ddb1 100644 --- a/tests/failover.go +++ b/tests/failover.go @@ -447,6 +447,65 @@ func (oa *operatorActions) GetNodeMap(info *TidbClusterConfig, component string) return nodeMap, nil } +func (oa *operatorActions) CheckOneEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) { + KeepOrDie(3*time.Second, 10*time.Minute, func() error { + err := oa.CheckK8sAvailable(nil, nil) + if err != nil { + return err + } + glog.Infof("k8s cluster is available.") + err = oa.CheckOperatorAvailable(operatorConfig) + if err != nil { + return err + } + glog.Infof("tidb operator is available.") + err = oa.CheckTidbClustersAvailable(clusters) + if err != nil { + return err + } + glog.Infof("all clusters is available") + return nil + }) +} + +func (oa *operatorActions) CheckOneApiserverDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) { + affectedPods := map[string]*corev1.Pod{} + apiserverPod := GetApiserverPodOrDie(oa.kubeCli, faultNode) + if apiserverPod != nil { + affectedPods[apiserverPod.GetName()] = apiserverPod + } + controllerPod := GetControllerManagerPodOrDie(oa.kubeCli, faultNode) + if controllerPod != nil { + affectedPods[controllerPod.GetName()] = controllerPod + } + schedulerPod := GetSchedulerPodOrDie(oa.kubeCli, faultNode) + if schedulerPod != nil { + affectedPods[schedulerPod.GetName()] = schedulerPod + } + dnsPod := GetDnsPodOrDie(oa.kubeCli, faultNode) + if dnsPod != nil { + affectedPods[dnsPod.GetName()] = dnsPod + } + KeepOrDie(3*time.Second, 10*time.Minute, func() error { + err := oa.CheckK8sAvailable(nil, affectedPods) + if err != nil { + return err + } + glog.Infof("k8s cluster is available.") + err = oa.CheckOperatorAvailable(operatorConfig) + if err != nil { + return err + } + glog.Infof("tidb operator is available.") + err = oa.CheckTidbClustersAvailable(clusters) + if err != nil { + return err + } + glog.Infof("all clusters is available") + return nil + }) +} + func (oa *operatorActions) CheckK8sAvailable(excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) error { return wait.Poll(3*time.Second, time.Minute, func() (bool, error) { nodes, err := oa.kubeCli.CoreV1().Nodes().List(metav1.ListOptions{}) @@ -460,7 +519,8 @@ func (oa *operatorActions) CheckK8sAvailable(excludeNodes map[string]*corev1.Nod } for _, condition := range node.Status.Conditions { if condition.Type == corev1.NodeReady && condition.Status != corev1.ConditionTrue { - return false, fmt.Errorf("node: [%s] is not in running", node.GetName()) + glog.Errorf("node: [%s] is not in running", node.GetName()) + return false, nil } } } @@ -473,8 +533,10 @@ func (oa *operatorActions) CheckK8sAvailable(excludeNodes map[string]*corev1.Nod if _, exist := excludePods[pod.GetName()]; exist { continue } - if GetPodStatus(&pod) != string(corev1.PodRunning) { - return false, fmt.Errorf("pod:[%s/%s] is unavailable", pod.GetName(), pod.GetNamespace()) + podState := GetPodStatus(&pod) + if podState != string(corev1.PodRunning) { + glog.Errorf("pod:[%s/%s] is unavailable,state is %s", pod.GetName(), pod.GetNamespace(), podState) + return false, nil } } return true, nil diff --git a/tests/util.go b/tests/util.go index 5d8888ecb73..527af1953ec 100644 --- a/tests/util.go +++ b/tests/util.go @@ -18,6 +18,8 @@ import ( "math/rand" "time" + "github.com/golang/glog" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" @@ -60,6 +62,38 @@ func GetApiserverPodOrDie(kubeCli kubernetes.Interface, node string) *corev1.Pod panic(fmt.Errorf("can't find apiserver in node:%s", node)) } +func GetSchedulerPodOrDie(kubeCli kubernetes.Interface, node string) *corev1.Pod { + selector := labels.Set(map[string]string{"component": "kube-scheduler"}).AsSelector() + options := metav1.ListOptions{LabelSelector: selector.String()} + apiserverPods, err := kubeCli.CoreV1().Pods("kube-system").List(options) + if err != nil { + panic(err) + } + for _, apiserverPod := range apiserverPods.Items { + if apiserverPod.Spec.NodeName == node { + return &apiserverPod + } + } + glog.Infof("can't find scheduler in node:%s", node) + return nil +} + +func GetDnsPodOrDie(kubeCli kubernetes.Interface, node string) *corev1.Pod { + selector := labels.Set(map[string]string{"component": "kube-dns"}).AsSelector() + options := metav1.ListOptions{LabelSelector: selector.String()} + apiserverPods, err := kubeCli.CoreV1().Pods("kube-system").List(options) + if err != nil { + panic(err) + } + for _, apiserverPod := range apiserverPods.Items { + if apiserverPod.Spec.NodeName == node { + return &apiserverPod + } + } + glog.Infof("can't find dns in node:%s", node) + return nil +} + func GetControllerManagerPodOrDie(kubeCli kubernetes.Interface, node string) *corev1.Pod { selector := labels.Set(map[string]string{"component": "kube-controller-manager"}).AsSelector() options := metav1.ListOptions{LabelSelector: selector.String()} From 7ff1546c5a97c6bf15a44b93813e985387aeb49d Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Fri, 19 Apr 2019 14:34:23 +0800 Subject: [PATCH 08/16] fix --- tests/failover.go | 20 +++++++++++--- tests/util.go | 70 ++++++++++++----------------------------------- 2 files changed, 34 insertions(+), 56 deletions(-) diff --git a/tests/failover.go b/tests/failover.go index f72a259ddb1..bea7acc541a 100644 --- a/tests/failover.go +++ b/tests/failover.go @@ -470,19 +470,31 @@ func (oa *operatorActions) CheckOneEtcdDownOrDie(operatorConfig *OperatorConfig, func (oa *operatorActions) CheckOneApiserverDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) { affectedPods := map[string]*corev1.Pod{} - apiserverPod := GetApiserverPodOrDie(oa.kubeCli, faultNode) + apiserverPod, err := GetApiserverPod(oa.kubeCli, faultNode) + if err != nil { + panic(fmt.Errorf("can't find apiserver in node:%s", faultNode)) + } if apiserverPod != nil { affectedPods[apiserverPod.GetName()] = apiserverPod } - controllerPod := GetControllerManagerPodOrDie(oa.kubeCli, faultNode) + controllerPod, err := GetControllerManagerPod(oa.kubeCli, faultNode) + if err != nil { + glog.Infof("can't find controllerManager in node:%s", faultNode) + } if controllerPod != nil { affectedPods[controllerPod.GetName()] = controllerPod } - schedulerPod := GetSchedulerPodOrDie(oa.kubeCli, faultNode) + schedulerPod, err := GetSchedulerPod(oa.kubeCli, faultNode) + if err != nil { + glog.Infof("can't find scheduler in node:%s", faultNode) + } if schedulerPod != nil { affectedPods[schedulerPod.GetName()] = schedulerPod } - dnsPod := GetDnsPodOrDie(oa.kubeCli, faultNode) + dnsPod, err := GetDnsPod(oa.kubeCli, faultNode) + if err != nil { + panic(fmt.Errorf("can't find controller-manager in node:%s", faultNode)) + } if dnsPod != nil { affectedPods[dnsPod.GetName()] = dnsPod } diff --git a/tests/util.go b/tests/util.go index 527af1953ec..44c10acb997 100644 --- a/tests/util.go +++ b/tests/util.go @@ -14,12 +14,9 @@ package tests import ( - "fmt" "math/rand" "time" - "github.com/golang/glog" - corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" @@ -47,64 +44,33 @@ func SelectNode(nodes []Nodes) string { return nodes[index].Nodes[0] } -func GetApiserverPodOrDie(kubeCli kubernetes.Interface, node string) *corev1.Pod { - selector := labels.Set(map[string]string{"component": "kube-apiserver"}).AsSelector() - options := metav1.ListOptions{LabelSelector: selector.String()} - apiserverPods, err := kubeCli.CoreV1().Pods("kube-system").List(options) - if err != nil { - panic(err) - } - for _, apiserverPod := range apiserverPods.Items { - if apiserverPod.Spec.NodeName == node { - return &apiserverPod - } - } - panic(fmt.Errorf("can't find apiserver in node:%s", node)) +func GetApiserverPod(kubeCli kubernetes.Interface, node string) (*corev1.Pod, error) { + return GetKubeComponent(kubeCli, node, "kube-apiserver") } -func GetSchedulerPodOrDie(kubeCli kubernetes.Interface, node string) *corev1.Pod { - selector := labels.Set(map[string]string{"component": "kube-scheduler"}).AsSelector() - options := metav1.ListOptions{LabelSelector: selector.String()} - apiserverPods, err := kubeCli.CoreV1().Pods("kube-system").List(options) - if err != nil { - panic(err) - } - for _, apiserverPod := range apiserverPods.Items { - if apiserverPod.Spec.NodeName == node { - return &apiserverPod - } - } - glog.Infof("can't find scheduler in node:%s", node) - return nil +func GetSchedulerPod(kubeCli kubernetes.Interface, node string) (*corev1.Pod, error) { + return GetKubeComponent(kubeCli, node, "kube-scheduler") } -func GetDnsPodOrDie(kubeCli kubernetes.Interface, node string) *corev1.Pod { - selector := labels.Set(map[string]string{"component": "kube-dns"}).AsSelector() - options := metav1.ListOptions{LabelSelector: selector.String()} - apiserverPods, err := kubeCli.CoreV1().Pods("kube-system").List(options) - if err != nil { - panic(err) - } - for _, apiserverPod := range apiserverPods.Items { - if apiserverPod.Spec.NodeName == node { - return &apiserverPod - } - } - glog.Infof("can't find dns in node:%s", node) - return nil +func GetDnsPod(kubeCli kubernetes.Interface, node string) (*corev1.Pod, error) { + return GetKubeComponent(kubeCli, node, "kube-dns") +} + +func GetControllerManagerPod(kubeCli kubernetes.Interface, node string) (*corev1.Pod, error) { + return GetKubeComponent(kubeCli, node, "kube-controller-manager") } -func GetControllerManagerPodOrDie(kubeCli kubernetes.Interface, node string) *corev1.Pod { - selector := labels.Set(map[string]string{"component": "kube-controller-manager"}).AsSelector() +func GetKubeComponent(kubeCli kubernetes.Interface, node string, componentName string) (*corev1.Pod, error) { + selector := labels.Set(map[string]string{"component": componentName}).AsSelector() options := metav1.ListOptions{LabelSelector: selector.String()} - apiserverPods, err := kubeCli.CoreV1().Pods("kube-system").List(options) + componentPods, err := kubeCli.CoreV1().Pods("kube-system").List(options) if err != nil { - panic(err) + return nil, err } - for _, apiserverPod := range apiserverPods.Items { - if apiserverPod.Spec.NodeName == node { - return &apiserverPod + for _, componentPod := range componentPods.Items { + if componentPod.Spec.NodeName == node { + return &componentPod, nil } } - panic(fmt.Errorf("can't find controller-manager in node:%s", node)) + return nil, nil } From cf6231ff41656ca95dc6ad5b90a4825e43ff337b Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Fri, 19 Apr 2019 19:34:49 +0800 Subject: [PATCH 09/16] address comment --- tests/actions.go | 1 + tests/cmd/stability/main.go | 5 ++++- tests/failover.go | 6 ++++++ tests/util.go | 4 +++- 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/tests/actions.go b/tests/actions.go index ab0ad023d0f..73ab0dceb64 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -119,6 +119,7 @@ type OperatorActions interface { CheckRecover(cluster *TidbClusterConfig) (bool, error) CheckRecoverOrDie(clusters []*TidbClusterConfig) CheckK8sAvailable(excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) error + CheckK8sAvailableOrDie(excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) CheckOperatorAvailable(operatorConfig *OperatorConfig) error CheckTidbClustersAvailable(infos []*TidbClusterConfig) error CheckOneEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index c779140a81d..628e6ba6cfa 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -42,7 +42,7 @@ func main() { oa := tests.NewOperatorActions(cli, kubeCli, conf) fta := tests.NewFaultTriggerAction(cli, kubeCli, conf) fta.CheckAndRecoverEnvOrDie() - oa.CheckK8sAvailable(nil, nil) + oa.CheckK8sAvailableOrDie(nil, nil) tidbVersion := conf.GetTiDBVersionOrDie() upgardeTiDBVersions := conf.GetUpgradeTidbVersionsOrDie() @@ -226,6 +226,8 @@ func main() { faultEtcd := tests.SelectNode(conf.ETCDs) fta.StopETCDOrDie(faultEtcd) defer fta.StartETCDOrDie(faultEtcd) + // TODO make the pause interval as a argument + time.Sleep(3 * time.Minute) oa.CheckOneEtcdDownOrDie(operatorCfg, allClusters, faultEtcd) fta.StartETCDOrDie(faultEtcd) @@ -233,6 +235,7 @@ func main() { faultApiserver := tests.SelectNode(conf.APIServers) fta.StopKubeAPIServerOrDie(faultApiserver) defer fta.StartKubeAPIServer(faultApiserver) + time.Sleep(3 * time.Minute) oa.CheckOneApiserverDownOrDie(operatorCfg, allClusters, faultApiserver) fta.StartKubeAPIServerOrDie(faultApiserver) diff --git a/tests/failover.go b/tests/failover.go index bea7acc541a..87bfcb9f556 100644 --- a/tests/failover.go +++ b/tests/failover.go @@ -518,6 +518,12 @@ func (oa *operatorActions) CheckOneApiserverDownOrDie(operatorConfig *OperatorCo }) } +func (oa *operatorActions) CheckK8sAvailableOrDie(excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) { + if err := oa.CheckK8sAvailable(excludeNodes, excludePods); err != nil { + panic(err) + } +} + func (oa *operatorActions) CheckK8sAvailable(excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) error { return wait.Poll(3*time.Second, time.Minute, func() (bool, error) { nodes, err := oa.kubeCli.CoreV1().Nodes().List(metav1.ListOptions{}) diff --git a/tests/util.go b/tests/util.go index 44c10acb997..2531ea00d82 100644 --- a/tests/util.go +++ b/tests/util.go @@ -41,7 +41,9 @@ func KeepOrDie(interval time.Duration, period time.Duration, fun func() error) { func SelectNode(nodes []Nodes) string { rand.Seed(time.Now().Unix()) index := rand.Intn(len(nodes)) - return nodes[index].Nodes[0] + vmNodes := nodes[index].Nodes + index2 := rand.Intn(len(vmNodes)) + return vmNodes[index2] } func GetApiserverPod(kubeCli kubernetes.Interface, node string) (*corev1.Pod, error) { From 67a70c1eaa19d9e7eba377beca5c86824fec176a Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Wed, 24 Apr 2019 11:44:43 +0800 Subject: [PATCH 10/16] fix --- tests/actions.go | 4 +- tests/cmd/stability/main.go | 4 +- tests/failover.go | 82 +++++++++++++++++++++++++------------ 3 files changed, 59 insertions(+), 31 deletions(-) diff --git a/tests/actions.go b/tests/actions.go index 55f7a9a243f..d1fdc895ba1 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -120,8 +120,8 @@ type OperatorActions interface { GetNodeMap(info *TidbClusterConfig, component string) (map[string][]string, error) TruncateSSTFileThenCheckFailover(info *TidbClusterConfig, tikvFailoverPeriod time.Duration) error TruncateSSTFileThenCheckFailoverOrDie(info *TidbClusterConfig, tikvFailoverPeriod time.Duration) - CheckFailoverPending(info *TidbClusterConfig, faultPoint *time.Time) (bool, error) - CheckFailoverPendingOrDie(clusters []*TidbClusterConfig, faultPoint *time.Time) + CheckFailoverPending(info *TidbClusterConfig, node string, faultPoint *time.Time) (bool, error) + CheckFailoverPendingOrDie(clusters []*TidbClusterConfig, node string, faultPoint *time.Time) CheckFailover(info *TidbClusterConfig, faultNode string) (bool, error) CheckFailoverOrDie(clusters []*TidbClusterConfig, faultNode string) CheckRecover(cluster *TidbClusterConfig) (bool, error) diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index 4bb461448dc..cd95d06477d 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -213,7 +213,7 @@ func main() { // stop a node and failover automatically physicalNode, node, faultTime := fta.StopNodeOrDie() - oa.CheckFailoverPendingOrDie(allClusters, &faultTime) + oa.CheckFailoverPendingOrDie(allClusters, node, &faultTime) oa.CheckFailoverOrDie(allClusters, node) time.Sleep(3 * time.Minute) fta.StartNodeOrDie(physicalNode, node) @@ -223,7 +223,7 @@ func main() { } // truncate a sst file and check failover - oa.TruncateSSTFileThenCheckFailoverOrDie(cluster1, 5*time.Minute) + //oa.TruncateSSTFileThenCheckFailoverOrDie(cluster1, 5*time.Minute) // stop one etcd node and k8s/operator/tidbcluster is available faultEtcd := tests.SelectNode(conf.ETCDs) diff --git a/tests/failover.go b/tests/failover.go index 87bfcb9f556..431ed7e9520 100644 --- a/tests/failover.go +++ b/tests/failover.go @@ -91,7 +91,7 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailover(info *TidbClusterCon } // restart tikv to ensure sst files - err = tikvOps.KillProcess(info.Namespace, store.PodName, "tikv", 1, syscall.SIGTERM) + err = tikvOps.KillProcess(info.Namespace, store.PodName, "tikv", 1, syscall.SIGKILL) if err != nil { glog.Errorf("kill tikv: pod=%s err=%s", store.PodName, err.Error()) return err @@ -154,7 +154,12 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailoverOrDie(info *TidbClust } } -func (oa *operatorActions) CheckFailoverPending(info *TidbClusterConfig, faultPoint *time.Time) (bool, error) { +func (oa *operatorActions) CheckFailoverPending(info *TidbClusterConfig, node string, faultPoint *time.Time) (bool, error) { + affectedPods, err := oa.getPodsByNode(info, node) + if err != nil { + glog.Infof("cluster:[%s] query pods failed,error:%v", info.FullName(), err) + return false, nil + } tc, err := oa.cli.PingcapV1alpha1().TidbClusters(info.Namespace).Get(info.ClusterName, metav1.GetOptions{}) if err != nil { glog.Infof("pending failover,failed to get tidbcluster:[%s], error: %v", info.FullName(), err) @@ -168,19 +173,32 @@ func (oa *operatorActions) CheckFailoverPending(info *TidbClusterConfig, faultPo deadline := faultPoint.Add(period) if time.Now().Before(deadline) { if tc.Status.PD.FailureMembers != nil && len(tc.Status.PD.FailureMembers) > 0 { - err := fmt.Errorf("cluster: [%s] the pd member should be mark failure after %s", info.FullName(), deadline.Format(time.RFC3339)) - glog.Errorf(err.Error()) - return false, err + for _, failureMember := range tc.Status.PD.FailureMembers { + if _, exist := affectedPods[failureMember.PodName]; exist { + err := fmt.Errorf("cluster: [%s] the pd member[%s] should be mark failure after %s", info.FullName(), failureMember.PodName, deadline.Format(time.RFC3339)) + glog.Errorf(err.Error()) + return false, err + } + } } if tc.Status.TiKV.FailureStores != nil && len(tc.Status.TiKV.FailureStores) > 0 { - err := fmt.Errorf("cluster: [%s] the tikv store should be mark failure after %s", info.FullName(), deadline.Format(time.RFC3339)) - glog.Errorf(err.Error()) - return false, err + for _, failureStore := range tc.Status.TiKV.FailureStores { + if _, exist := affectedPods[failureStore.PodName]; exist { + err := fmt.Errorf("cluster: [%s] the tikv store[%s] should be mark failure after %s", info.FullName(), failureStore.PodName, deadline.Format(time.RFC3339)) + glog.Errorf(err.Error()) + return false, err + } + } + } if tc.Status.TiDB.FailureMembers != nil && len(tc.Status.TiDB.FailureMembers) > 0 { - err := fmt.Errorf("cluster: [%s] the tidb member should be mark failure after %s", info.FullName(), deadline.Format(time.RFC3339)) - glog.Errorf(err.Error()) - return false, err + for _, failureMember := range tc.Status.TiDB.FailureMembers { + if _, exist := affectedPods[failureMember.PodName]; exist { + err := fmt.Errorf("cluster: [%s] the tidb member[%s] should be mark failure after %s", info.FullName(), failureMember.PodName, deadline.Format(time.RFC3339)) + glog.Errorf(err.Error()) + return false, err + } + } } glog.Infof("cluster: [%s] operator's failover feature is pending", info.FullName()) @@ -189,11 +207,11 @@ func (oa *operatorActions) CheckFailoverPending(info *TidbClusterConfig, faultPo return true, nil } -func (oa *operatorActions) CheckFailoverPendingOrDie(clusters []*TidbClusterConfig, faultPoint *time.Time) { +func (oa *operatorActions) CheckFailoverPendingOrDie(clusters []*TidbClusterConfig, node string, faultPoint *time.Time) { if err := wait.Poll(1*time.Minute, 30*time.Minute, func() (bool, error) { var passes []bool for i := range clusters { - pass, err := oa.CheckFailoverPending(clusters[i], faultPoint) + pass, err := oa.CheckFailoverPending(clusters[i], node, faultPoint) if err != nil { return pass, err } @@ -211,23 +229,12 @@ func (oa *operatorActions) CheckFailoverPendingOrDie(clusters []*TidbClusterConf } func (oa *operatorActions) CheckFailover(info *TidbClusterConfig, node string) (bool, error) { - selector, err := label.New().Instance(info.ClusterName).Selector() - if err != nil { - glog.Errorf("cluster:[%s] create selector failed, error:%v", info.FullName(), err) - return false, nil - } - pods, err := oa.kubeCli.CoreV1().Pods(info.Namespace).List(metav1.ListOptions{LabelSelector: selector.String()}) + affectedPods, err := oa.getPodsByNode(info, node) if err != nil { - glog.Errorf("cluster:[%s] query pods failed, error:%v", info.FullName(), err) + glog.Infof("cluster:[%s] query pods failed,error:%v", info.FullName(), err) return false, nil } - affectedPods := map[string]*corev1.Pod{} - for i, pod := range pods.Items { - if pod.Spec.NodeName == node { - affectedPods[pod.Name] = &pods.Items[i] - } - } if len(affectedPods) == 0 { glog.Infof("the cluster:[%s] can not be affected by node:[%s]", info.FullName(), node) return true, nil @@ -260,6 +267,27 @@ func (oa *operatorActions) CheckFailover(info *TidbClusterConfig, node string) ( return true, nil } +func (oa *operatorActions) getPodsByNode(info *TidbClusterConfig, node string) (map[string]*corev1.Pod, error) { + selector, err := label.New().Instance(info.ClusterName).Selector() + if err != nil { + glog.Errorf("cluster:[%s] create selector failed, error:%v", info.FullName(), err) + return nil, err + } + pods, err := oa.kubeCli.CoreV1().Pods(info.Namespace).List(metav1.ListOptions{LabelSelector: selector.String()}) + if err != nil { + glog.Errorf("cluster:[%s] query pods failed, error:%v", info.FullName(), err) + return nil, err + } + podsOfNode := map[string]*corev1.Pod{} + for i, pod := range pods.Items { + if pod.Spec.NodeName == node { + podsOfNode[pod.Name] = &pods.Items[i] + } + } + + return podsOfNode, nil +} + func (oa *operatorActions) CheckFailoverOrDie(clusters []*TidbClusterConfig, faultNode string) { if err := wait.Poll(1*time.Minute, 30*time.Minute, func() (bool, error) { var passes []bool @@ -525,7 +553,7 @@ func (oa *operatorActions) CheckK8sAvailableOrDie(excludeNodes map[string]*corev } func (oa *operatorActions) CheckK8sAvailable(excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) error { - return wait.Poll(3*time.Second, time.Minute, func() (bool, error) { + return wait.Poll(3*time.Second, 3*time.Minute, func() (bool, error) { nodes, err := oa.kubeCli.CoreV1().Nodes().List(metav1.ListOptions{}) if err != nil { glog.Errorf("failed to list nodes,error:%v", err) From 30afceef5f65dc59e5bc5e20590a96a492a83b64 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Fri, 26 Apr 2019 17:20:57 +0800 Subject: [PATCH 11/16] fix bugs --- tests/actions.go | 38 +++++++------ tests/cmd/stability/main.go | 4 +- tests/failover.go | 85 ++++++++++++++-------------- tests/fault.go | 2 +- tests/pkg/blockwriter/blockwriter.go | 12 ++-- tests/pkg/webhook/pods.go | 2 +- 6 files changed, 71 insertions(+), 72 deletions(-) diff --git a/tests/actions.go b/tests/actions.go index 7db1f69a871..8b894f6da5c 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -419,14 +419,14 @@ func (oa *operatorActions) CleanTidbCluster(info *TidbClusterConfig) error { pollFn := func() (bool, error) { if res, err := exec.Command("kubectl", "get", "po", "--output=name", "-n", info.Namespace, "-l", setStr). CombinedOutput(); err != nil || len(res) != 0 { - glog.V(4).Infof("waiting for tidbcluster: %s/%s pods deleting, %v, [%s]", + glog.Infof("waiting for tidbcluster: %s/%s pods deleting, %v, [%s]", info.Namespace, info.ClusterName, err, string(res)) return false, nil } - pvCmd := fmt.Sprintf("kubectl get pv -l %s=%s,%s=%s 2>/dev/null|grep Released", - label.NamespaceLabelKey, info.Namespace, label.InstanceLabelKey, info.ClusterName) - glog.V(4).Info(pvCmd) + pvCmd := fmt.Sprintf("kubectl get pv | grep %s | grep %s 2>/dev/null|grep Released", + info.Namespace, info.ClusterName) + glog.Info(pvCmd) if res, err := exec.Command("/bin/sh", "-c", pvCmd). CombinedOutput(); len(res) == 0 { } else if err != nil { @@ -465,35 +465,35 @@ func (oa *operatorActions) CheckTidbClusterStatus(info *TidbClusterConfig) error return false, nil } - glog.V(4).Infof("check tidb cluster begin tidbMembersReadyFn") + glog.Infof("check tidb cluster begin tidbMembersReadyFn") if b, err := oa.tidbMembersReadyFn(tc); !b && err == nil { return false, nil } - glog.V(4).Infof("check tidb cluster begin reclaimPolicySyncFn") + glog.Infof("check tidb cluster begin reclaimPolicySyncFn") if b, err := oa.reclaimPolicySyncFn(tc); !b && err == nil { return false, nil } - glog.V(4).Infof("check tidb cluster begin metaSyncFn") + glog.Infof("check tidb cluster begin metaSyncFn") if b, err := oa.metaSyncFn(tc); err != nil { return false, err } else if !b && err == nil { return false, nil } - glog.V(4).Infof("check tidb cluster begin schedulerHAFn") + glog.Infof("check tidb cluster begin schedulerHAFn") if b, err := oa.schedulerHAFn(tc); !b && err == nil { return false, nil } - glog.V(4).Infof("check tidb cluster begin passwordIsSet") + glog.Infof("check tidb cluster begin passwordIsSet") if b, err := oa.passwordIsSet(info); !b && err == nil { return false, nil } if info.Monitor { - glog.V(4).Infof("check tidb monitor normal") + glog.Infof("check tidb monitor normal") if b, err := oa.monitorNormal(info); !b && err == nil { return false, nil } @@ -1405,11 +1405,12 @@ func (oa *operatorActions) DeployAdHocBackup(info *TidbClusterConfig) error { glog.Infof("begin to deploy adhoc backup cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) sets := map[string]string{ - "name": info.BackupName, - "mode": "backup", - "user": "root", - "password": info.Password, - "storage.size": "10Gi", + "name": info.BackupName, + "mode": "backup", + "user": "root", + "password": info.Password, + "storage.size": "10Gi", + "backupOptions": "'--chunk-filesize=100 --threads=1'", } setString := info.BackupHelmSetString(sets) @@ -1446,7 +1447,7 @@ func (oa *operatorActions) CheckAdHocBackup(info *TidbClusterConfig) error { err := wait.Poll(DefaultPollInterval, BackupAndRestorePollTimeOut, fn) if err != nil { - return fmt.Errorf("failed to launch scheduler backup job: %v", err) + return fmt.Errorf("failed to launch backup job: %v", err) } return nil @@ -1487,7 +1488,7 @@ func (oa *operatorActions) CheckRestore(from *TidbClusterConfig, to *TidbCluster return false, nil } if job.Status.Succeeded == 0 { - glog.Errorf("cluster [%s] back up job is not completed, please wait! ", to.ClusterName) + glog.Errorf("cluster [%s] restore job is not completed, please wait! ", to.ClusterName) return false, nil } @@ -1513,7 +1514,7 @@ func (oa *operatorActions) CheckRestore(from *TidbClusterConfig, to *TidbCluster err := wait.Poll(DefaultPollInterval, BackupAndRestorePollTimeOut, fn) if err != nil { - return fmt.Errorf("failed to launch scheduler backup job: %v", err) + return fmt.Errorf("failed to launch restore job: %v", err) } return nil } @@ -1608,6 +1609,7 @@ func (oa *operatorActions) DeployScheduledBackup(info *TidbClusterConfig) error "scheduledBackup.schedule": cron, "scheduledBackup.storage": "10Gi", "scheduledBackup.secretName": info.BackupSecretName, + "scheduledBackup.options": "'--chunk-filesize=100 --threads=1'", } setString := info.TidbClusterHelmSetString(sets) diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index c8dce9b1aca..e8bca705862 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -34,12 +34,12 @@ func main() { logs.InitLogs() defer logs.FlushLogs() go func() { - glog.Info(http.ListenAndServe("localhost:6060", nil)) + glog.Info(http.ListenAndServe(":6060", nil)) }() conf := tests.ParseConfigOrDie() cli, kubeCli := client.NewCliOrDie() - oa := tests.NewOperatorActions(cli, kubeCli, tests.DefaultPollTimeout, conf) + oa := tests.NewOperatorActions(cli, kubeCli, tests.DefaultPollInterval, conf) fta := tests.NewFaultTriggerAction(cli, kubeCli, conf) fta.CheckAndRecoverEnvOrDie() oa.CheckK8sAvailableOrDie(nil, nil) diff --git a/tests/failover.go b/tests/failover.go index b77894a268a..257624f0baa 100644 --- a/tests/failover.go +++ b/tests/failover.go @@ -14,7 +14,6 @@ import ( "github.com/pingcap/tidb-operator/pkg/label" "github.com/pingcap/tidb-operator/tests/pkg/client" "github.com/pingcap/tidb-operator/tests/pkg/ops" - "github.com/pingcap/tidb-operator/tests/pkg/util" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" @@ -73,48 +72,48 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailover(info *TidbClusterCon } // checkout pod status - podBeforeRestart, err := cli.CoreV1().Pods(info.Namespace).Get(store.PodName, metav1.GetOptions{}) - if err != nil { - glog.Errorf("failed to get target pod: pod=%s err=%s", store.PodName, err.Error()) - return err - } - - var rc int32 - if c := util.GetContainerStatusFromPod(podBeforeRestart, func(status corev1.ContainerStatus) bool { - return status.Name == "tikv" - }); c != nil { - rc = c.RestartCount - } else { - glog.Errorf("failed to get container status from tikv pod") - return errors.New("failed to get container status from tikv pod") - } - - // restart tikv to ensure sst files - err = tikvOps.KillProcess(info.Namespace, store.PodName, "tikv", "tikv-server") - if err != nil { - glog.Errorf("kill tikv: pod=%s err=%s", store.PodName, err.Error()) - return err - } - - err = tikvOps.PollPod(info.Namespace, store.PodName, - func(pod *corev1.Pod, err error) (bool, error) { - if pod == nil { - glog.Warningf("pod is nil: err=%s", err.Error()) - return false, nil - } - tikv := util.GetContainerStatusFromPod(pod, func(status corev1.ContainerStatus) bool { - return status.Name == "tikv" - }) - - if pod.Status.Phase == corev1.PodRunning && tikv != nil && tikv.RestartCount > rc { - return true, nil - } - return false, nil - }) - if err != nil { - glog.Errorf("tikv process hasn't been restarted: err=%s", err.Error()) - return err - } + //podBeforeRestart, err := cli.CoreV1().Pods(info.Namespace).Get(store.PodName, metav1.GetOptions{}) + //if err != nil { + // glog.Errorf("failed to get target pod: pod=%s err=%s", store.PodName, err.Error()) + // return err + //} + + //var rc int32 + //if c := util.GetContainerStatusFromPod(podBeforeRestart, func(status corev1.ContainerStatus) bool { + // return status.Name == "tikv" + //}); c != nil { + // rc = c.RestartCount + //} else { + // glog.Errorf("failed to get container status from tikv pod") + // return errors.New("failed to get container status from tikv pod") + //} + + //// restart tikv to ensure sst files + //err = tikvOps.KillProcess(info.Namespace, store.PodName, "tikv", "tikv-server") + //if err != nil { + // glog.Errorf("kill tikv: pod=%s err=%s", store.PodName, err.Error()) + // return err + //} + + //err = tikvOps.PollPod(info.Namespace, store.PodName, + // func(pod *corev1.Pod, err error) (bool, error) { + // if pod == nil { + // glog.Warningf("pod is nil: err=%s", err.Error()) + // return false, nil + // } + // tikv := util.GetContainerStatusFromPod(pod, func(status corev1.ContainerStatus) bool { + // return status.Name == "tikv" + // }) + + // if pod.Status.Phase == corev1.PodRunning && tikv != nil && tikv.RestartCount > rc { + // return true, nil + // } + // return false, nil + // }) + //if err != nil { + // glog.Errorf("cluster:[%s]'s tikv[%s] process hasn't been restarted: err=%s", info.ClusterName, store.PodName, err.Error()) + // return err + //} // truncate the sst file and wait for failover err = tikvOps.TruncateSSTFile(ops.TruncateOptions{ diff --git a/tests/fault.go b/tests/fault.go index 6718043e398..e904846c117 100644 --- a/tests/fault.go +++ b/tests/fault.go @@ -182,7 +182,7 @@ func (fa *faultTriggerActions) StartNode(physicalNode string, node string) error return err } - glog.Infof("node %s on physical node %s is started", physicalNode, node) + glog.Infof("node %s on physical node %s is started", node, physicalNode) return nil } diff --git a/tests/pkg/blockwriter/blockwriter.go b/tests/pkg/blockwriter/blockwriter.go index 978c391a23b..6e6fb5af374 100644 --- a/tests/pkg/blockwriter/blockwriter.go +++ b/tests/pkg/blockwriter/blockwriter.go @@ -29,7 +29,7 @@ import ( ) const ( - queryChanSize int = 10000 + queryChanSize int = 100 ) // BlockWriterCase is for concurrent writing blocks. @@ -123,13 +123,11 @@ func (c *BlockWriterCase) generateQuery(ctx context.Context, queryChan chan []st select { case <-ctx.Done(): return + case queryChan <- querys: + continue default: - if len(queryChan) < queryChanSize { - queryChan <- querys - } else { - glog.V(4).Infof("[%s] [%s] [action: generate Query] query channel is full, sleep 10 seconds", c, c.ClusterName) - util.Sleep(ctx, 10*time.Second) - } + glog.V(4).Infof("[%s] [%s] [action: generate Query] query channel is full, sleep 10 seconds", c, c.ClusterName) + util.Sleep(ctx, 10*time.Second) } } } diff --git a/tests/pkg/webhook/pods.go b/tests/pkg/webhook/pods.go index 0e9e3fd395e..0de3057ec37 100644 --- a/tests/pkg/webhook/pods.go +++ b/tests/pkg/webhook/pods.go @@ -45,7 +45,7 @@ func admitPods(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse { return &reviewResponse } - glog.Infof("delete pod %s", pod.Labels["app.kubernetes.io/component"]) + glog.Infof("delete %s pod [%s]", pod.Labels["app.kubernetes.io/component"], pod.GetName()) if pod.Labels["app.kubernetes.io/component"] == "tidb" { podIP := pod.Status.PodIP From e145ba463c1e4609ce35860cf6d033dc27f06149 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Sun, 28 Apr 2019 15:44:40 +0800 Subject: [PATCH 12/16] fix some bugs --- tests/actions.go | 4 ++-- tests/cmd/stability/main.go | 8 -------- tests/failover.go | 6 +++--- tests/fault.go | 2 +- 4 files changed, 6 insertions(+), 14 deletions(-) diff --git a/tests/actions.go b/tests/actions.go index 8f5583b7e43..b35e3d3adc1 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -131,8 +131,8 @@ type OperatorActions interface { CheckFailoverOrDie(clusters []*TidbClusterConfig, faultNode string) CheckRecover(cluster *TidbClusterConfig) (bool, error) CheckRecoverOrDie(clusters []*TidbClusterConfig) - CheckK8sAvailable(excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) error - CheckK8sAvailableOrDie(excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) + CheckK8sAvailable(excludeNodes map[string]string, excludePods map[string]*corev1.Pod) error + CheckK8sAvailableOrDie(excludeNodes map[string]string, excludePods map[string]*corev1.Pod) CheckOperatorAvailable(operatorConfig *OperatorConfig) error CheckTidbClustersAvailable(infos []*TidbClusterConfig) error CheckOneEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index e8bca705862..3805ede6342 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -234,14 +234,6 @@ func main() { oa.CheckOneEtcdDownOrDie(operatorCfg, allClusters, faultEtcd) fta.StartETCDOrDie(faultEtcd) - // stop one apiserver node and k8s/operator/tidbcluster is available - faultApiserver := tests.SelectNode(conf.APIServers) - fta.StopKubeAPIServerOrDie(faultApiserver) - defer fta.StartKubeAPIServer(faultApiserver) - time.Sleep(3 * time.Minute) - oa.CheckOneApiserverDownOrDie(operatorCfg, allClusters, faultApiserver) - fta.StartKubeAPIServerOrDie(faultApiserver) - //clean temp dirs when stability success err := conf.CleanTempDirs() if err != nil { diff --git a/tests/failover.go b/tests/failover.go index 6ed23ef31ce..244dd0d589f 100644 --- a/tests/failover.go +++ b/tests/failover.go @@ -488,7 +488,7 @@ func (oa *operatorActions) CheckOneApiserverDownOrDie(operatorConfig *OperatorCo affectedPods[dnsPod.GetName()] = dnsPod } KeepOrDie(3*time.Second, 10*time.Minute, func() error { - err := oa.CheckK8sAvailable(nil, affectedPods) + err := oa.CheckK8sAvailable(map[string]string{faultNode: faultNode}, affectedPods) if err != nil { return err } @@ -507,13 +507,13 @@ func (oa *operatorActions) CheckOneApiserverDownOrDie(operatorConfig *OperatorCo }) } -func (oa *operatorActions) CheckK8sAvailableOrDie(excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) { +func (oa *operatorActions) CheckK8sAvailableOrDie(excludeNodes map[string]string, excludePods map[string]*corev1.Pod) { if err := oa.CheckK8sAvailable(excludeNodes, excludePods); err != nil { panic(err) } } -func (oa *operatorActions) CheckK8sAvailable(excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) error { +func (oa *operatorActions) CheckK8sAvailable(excludeNodes map[string]string, excludePods map[string]*corev1.Pod) error { return wait.Poll(3*time.Second, 3*time.Minute, func() (bool, error) { nodes, err := oa.kubeCli.CoreV1().Nodes().List(metav1.ListOptions{}) if err != nil { diff --git a/tests/fault.go b/tests/fault.go index e904846c117..0a98a55e381 100644 --- a/tests/fault.go +++ b/tests/fault.go @@ -351,7 +351,7 @@ func (fa *faultTriggerActions) serviceAction(node string, serverName string, act return err } - glog.V(4).Infof("%s %s %s successfully", action, serverName, node) + glog.Infof("%s %s %s successfully", action, serverName, node) return nil } From b1a15037194996876a5a3f096fbc90c5bcc8f712 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Mon, 29 Apr 2019 14:14:40 +0800 Subject: [PATCH 13/16] remove declaration import --- tests/actions.go | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/actions.go b/tests/actions.go index a7e53af7e1d..de0f420b771 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -33,7 +33,6 @@ import ( pingcapErrors "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/tidb-operator/tests/pkg/apimachinery" - "github.com/pingcap/tidb-operator/tests/pkg/webhook" admissionV1beta1 "k8s.io/api/admissionregistration/v1beta1" "k8s.io/api/apps/v1beta1" batchv1 "k8s.io/api/batch/v1" From 71d989e2367fb191706be12aeea9b440312e627d Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Tue, 30 Apr 2019 11:54:16 +0800 Subject: [PATCH 14/16] address comment --- tests/actions.go | 34 ++++++++++++++++------------------ tests/failover.go | 25 +++++++++++++------------ 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/tests/actions.go b/tests/actions.go index de0f420b771..f3ca75d0d17 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -431,7 +431,7 @@ func (oa *operatorActions) CleanTidbCluster(info *TidbClusterConfig) error { patchPVCmd := fmt.Sprintf("kubectl get pv | grep %s | grep %s | awk '{print $1}' | "+ "xargs -I {} kubectl patch pv {} -p '{\"spec\":{\"persistentVolumeReclaimPolicy\":\"Delete\"}}'", info.Namespace, info.ClusterName) - glog.Info(patchPVCmd) + glog.V(4).Info(patchPVCmd) if res, err := exec.Command("/bin/sh", "-c", patchPVCmd).CombinedOutput(); err != nil { return fmt.Errorf("failed to patch pv: %v, %s", err, string(res)) } @@ -439,18 +439,18 @@ func (oa *operatorActions) CleanTidbCluster(info *TidbClusterConfig) error { pollFn := func() (bool, error) { if res, err := exec.Command("kubectl", "get", "po", "--output=name", "-n", info.Namespace, "-l", setStr). CombinedOutput(); err != nil || len(res) != 0 { - glog.Infof("waiting for tidbcluster: %s/%s pods deleting, %v, [%s]", + glog.V(4).Infof("waiting for tidbcluster: %s/%s pods deleting, %v, [%s]", info.Namespace, info.ClusterName, err, string(res)) return false, nil } pvCmd := fmt.Sprintf("kubectl get pv | grep %s | grep %s 2>/dev/null|grep Released", info.Namespace, info.ClusterName) - glog.Info(pvCmd) + glog.V(4).Info(pvCmd) if res, err := exec.Command("/bin/sh", "-c", pvCmd). CombinedOutput(); len(res) == 0 { } else if err != nil { - glog.Infof("waiting for tidbcluster: %s/%s pv deleting, %v, %s", + glog.V(4).Infof("waiting for tidbcluster: %s/%s pv deleting, %v, %s", info.Namespace, info.ClusterName, err, string(res)) return false, nil } @@ -485,42 +485,42 @@ func (oa *operatorActions) CheckTidbClusterStatus(info *TidbClusterConfig) error return false, nil } - glog.Infof("check tidb cluster begin tidbMembersReadyFn") + glog.V(4).Infof("check tidb cluster begin tidbMembersReadyFn") if b, err := oa.tidbMembersReadyFn(tc); !b && err == nil { return false, nil } - glog.Infof("check tidb cluster begin reclaimPolicySyncFn") + glog.V(4).Infof("check tidb cluster begin reclaimPolicySyncFn") if b, err := oa.reclaimPolicySyncFn(tc); !b && err == nil { return false, nil } - glog.Infof("check tidb cluster begin metaSyncFn") + glog.V(4).Infof("check tidb cluster begin metaSyncFn") if b, err := oa.metaSyncFn(tc); err != nil { return false, err } else if !b && err == nil { return false, nil } - glog.Infof("check tidb cluster begin schedulerHAFn") + glog.V(4).Infof("check tidb cluster begin schedulerHAFn") if b, err := oa.schedulerHAFn(tc); !b && err == nil { return false, nil } - glog.Infof("check tidb cluster begin passwordIsSet") + glog.V(4).Infof("check tidb cluster begin passwordIsSet") if b, err := oa.passwordIsSet(info); !b && err == nil { return false, nil } if info.Monitor { - glog.Infof("check tidb monitor normal") + glog.V(4).Infof("check tidb monitor normal") if b, err := oa.monitorNormal(info); !b && err == nil { return false, nil } } return true, nil }); err != nil { - glog.Infof("check tidb cluster status failed: %s", err.Error()) + glog.Errorf("check tidb cluster status failed: %s", err.Error()) return fmt.Errorf("failed to waiting for tidbcluster %s/%s ready in 30 minutes", ns, tcName) } @@ -1449,12 +1449,11 @@ func (oa *operatorActions) DeployAdHocBackup(info *TidbClusterConfig) error { glog.Infof("begin to deploy adhoc backup cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) sets := map[string]string{ - "name": info.BackupName, - "mode": "backup", - "user": "root", - "password": info.Password, - "storage.size": "10Gi", - "backupOptions": "'--chunk-filesize=100 --threads=1'", + "name": info.BackupName, + "mode": "backup", + "user": "root", + "password": info.Password, + "storage.size": "10Gi", } setString := info.BackupHelmSetString(sets) @@ -1655,7 +1654,6 @@ func (oa *operatorActions) DeployScheduledBackup(info *TidbClusterConfig) error "scheduledBackup.schedule": cron, "scheduledBackup.storage": "10Gi", "scheduledBackup.secretName": info.BackupSecretName, - "scheduledBackup.options": "'--chunk-filesize=100 --threads=1'", } setString := info.TidbClusterHelmSetString(sets) diff --git a/tests/failover.go b/tests/failover.go index 8295cf34805..a09629a3433 100644 --- a/tests/failover.go +++ b/tests/failover.go @@ -449,27 +449,29 @@ func (oa *operatorActions) GetNodeMap(info *TidbClusterConfig, component string) } func (oa *operatorActions) CheckOneEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) { + glog.Infof("check k8s/operator/tidbCluster status when one etcd down") KeepOrDie(3*time.Second, 10*time.Minute, func() error { err := oa.CheckK8sAvailable(nil, nil) if err != nil { return err } - glog.Infof("k8s cluster is available.") + glog.V(4).Infof("k8s cluster is available.") err = oa.CheckOperatorAvailable(operatorConfig) if err != nil { return err } - glog.Infof("tidb operator is available.") + glog.V(4).Infof("tidb operator is available.") err = oa.CheckTidbClustersAvailable(clusters) if err != nil { return err } - glog.Infof("all clusters is available") + glog.V(4).Infof("all clusters is available") return nil }) } func (oa *operatorActions) CheckOneApiserverDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) { + glog.Infof("check k8s/operator/tidbCluster status when one apiserver down") affectedPods := map[string]*corev1.Pod{} apiserverPod, err := GetApiserverPod(oa.kubeCli, faultNode) if err != nil { @@ -504,17 +506,17 @@ func (oa *operatorActions) CheckOneApiserverDownOrDie(operatorConfig *OperatorCo if err != nil { return err } - glog.Infof("k8s cluster is available.") + glog.V(4).Infof("k8s cluster is available.") err = oa.CheckOperatorAvailable(operatorConfig) if err != nil { return err } - glog.Infof("tidb operator is available.") + glog.V(4).Infof("tidb operator is available.") err = oa.CheckTidbClustersAvailable(clusters) if err != nil { return err } - glog.Infof("all clusters is available") + glog.V(4).Infof("all clusters is available") return nil }) } @@ -526,7 +528,7 @@ func (oa *operatorActions) CheckK8sAvailableOrDie(excludeNodes map[string]string } func (oa *operatorActions) CheckK8sAvailable(excludeNodes map[string]string, excludePods map[string]*corev1.Pod) error { - return wait.Poll(3*time.Second, 3*time.Minute, func() (bool, error) { + return wait.Poll(3*time.Second, time.Minute, func() (bool, error) { nodes, err := oa.kubeCli.CoreV1().Nodes().List(metav1.ListOptions{}) if err != nil { glog.Errorf("failed to list nodes,error:%v", err) @@ -538,8 +540,7 @@ func (oa *operatorActions) CheckK8sAvailable(excludeNodes map[string]string, exc } for _, condition := range node.Status.Conditions { if condition.Type == corev1.NodeReady && condition.Status != corev1.ConditionTrue { - glog.Errorf("node: [%s] is not in running", node.GetName()) - return false, nil + return false, fmt.Errorf("node: [%s] is not in running", node.GetName()) } } } @@ -605,20 +606,20 @@ var testTableName = "testTable" func (op *operatorActions) addDataToCluster(info *TidbClusterConfig) (bool, error) { db, err := sql.Open("mysql", getDSN(info.Namespace, info.ClusterName, "test", info.Password)) if err != nil { - glog.Infof("cluster:[%s] can't open connection to mysql: %v", info.FullName(), err) + glog.Errorf("cluster:[%s] can't open connection to mysql: %v", info.FullName(), err) return false, nil } defer db.Close() _, err = db.Exec(fmt.Sprintf("CREATE TABLE %s (name VARCHAR(64))", testTableName)) if err != nil && !tableAlreadyExist(err) { - glog.Infof("cluster:[%s] can't create table to mysql: %v", info.FullName(), err) + glog.Errorf("cluster:[%s] can't create table to mysql: %v", info.FullName(), err) return false, nil } _, err = db.Exec(fmt.Sprintf("INSERT INTO %s VALUES (?)", testTableName), "testValue") if err != nil { - glog.Infof("cluster:[%s] can't insert data to mysql: %v", info.FullName(), err) + glog.Errorf("cluster:[%s] can't insert data to mysql: %v", info.FullName(), err) return false, nil } From 8dd2eba1c15ab90d4ae1b5505ef21954fed8d462 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Tue, 30 Apr 2019 14:07:11 +0800 Subject: [PATCH 15/16] address comment --- tests/cmd/stability/main.go | 4 +--- tests/failover.go | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index 5cb7cde564b..c6d1880a277 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -19,12 +19,10 @@ import ( _ "net/http/pprof" "time" - "github.com/pingcap/tidb-operator/tests/backup" - "github.com/golang/glog" "github.com/jinzhu/copier" - "github.com/pingcap/tidb-operator/tests" + "github.com/pingcap/tidb-operator/tests/backup" "github.com/pingcap/tidb-operator/tests/pkg/client" "k8s.io/apiserver/pkg/util/logs" diff --git a/tests/failover.go b/tests/failover.go index a09629a3433..35e4521195e 100644 --- a/tests/failover.go +++ b/tests/failover.go @@ -555,8 +555,7 @@ func (oa *operatorActions) CheckK8sAvailable(excludeNodes map[string]string, exc } podState := GetPodStatus(&pod) if podState != string(corev1.PodRunning) { - glog.Errorf("pod:[%s/%s] is unavailable,state is %s", pod.GetName(), pod.GetNamespace(), podState) - return false, nil + return false, fmt.Errorf("pod:[%s/%s] is unavailable,state is %s", pod.GetName(), pod.GetNamespace(), podState) } } return true, nil From cab507ce2f06420ba9f7305290d7364b6b0ff2df Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Tue, 30 Apr 2019 15:05:44 +0800 Subject: [PATCH 16/16] fix webhook pod delete event filter --- tests/pkg/webhook/pods.go | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/tests/pkg/webhook/pods.go b/tests/pkg/webhook/pods.go index 0e8f22edafe..21c6cb56b73 100644 --- a/tests/pkg/webhook/pods.go +++ b/tests/pkg/webhook/pods.go @@ -95,15 +95,14 @@ func admitPods(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse { pdClient := controller.NewDefaultPDControl().GetPDClient(tc) tidbController := controller.NewDefaultTiDBControl() - if pod.Labels[label.ComponentLabelKey] == "tidb" { - - // if tidb pod is deleting, allow pod delete operation - if pod.DeletionTimestamp != nil { - glog.Infof("TIDB pod status is namespace %s name %s timestamp %s", namespace, name, pod.DeletionTimestamp) - reviewResponse.Allowed = true - return &reviewResponse - } + // if the pod is deleting, allow the pod delete operation + if pod.DeletionTimestamp != nil { + glog.Infof("pod:[%s/%s] status is timestamp %s", namespace, name, pod.DeletionTimestamp) + reviewResponse.Allowed = true + return &reviewResponse + } + if pod.Labels[label.ComponentLabelKey] == "tidb" { ordinal, err := strconv.ParseInt(strings.Split(name, "-")[len(strings.Split(name, "-"))-1], 10, 32) if err != nil { glog.Errorf("fail to convert string to int while deleting TiDB err %v", err) @@ -125,7 +124,6 @@ func admitPods(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse { } } else if pod.Labels[label.ComponentLabelKey] == "pd" { - leader, err := pdClient.GetPDLeader() if err != nil { glog.Errorf("fail to get pd leader %v", err) @@ -141,7 +139,6 @@ func admitPods(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse { } } else if pod.Labels[label.ComponentLabelKey] == "tikv" { - var storeID uint64 storeID = 0 for _, store := range tc.Status.TiKV.Stores {