diff --git a/tests/actions.go b/tests/actions.go index 7bc88f4775..f3ca75d0d1 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -33,7 +33,6 @@ import ( pingcapErrors "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/tidb-operator/tests/pkg/apimachinery" - "github.com/pingcap/tidb-operator/tests/pkg/webhook" admissionV1beta1 "k8s.io/api/admissionregistration/v1beta1" "k8s.io/api/apps/v1beta1" batchv1 "k8s.io/api/batch/v1" @@ -57,6 +56,13 @@ import ( const ( period = 5 * time.Minute + + tidbControllerName string = "tidb-controller-manager" + tidbSchedulerName string = "tidb-scheduler" + + // NodeUnreachablePodReason is defined in k8s.io/kubernetes/pkg/util/node + // but not in client-go and apimachinery, so we define it here + NodeUnreachablePodReason = "NodeLost" ) func NewOperatorActions(cli versioned.Interface, kubeCli kubernetes.Interface, pollInterval time.Duration, cfg *Config) OperatorActions { @@ -70,16 +76,17 @@ func NewOperatorActions(cli versioned.Interface, kubeCli kubernetes.Interface, p } const ( - DefaultPollTimeout time.Duration = 10 * time.Minute - DefaultPollInterval time.Duration = 1 * time.Minute - getBackupDirPodName = "get-backup-dir" - grafanaUsername = "admin" - grafanaPassword = "admin" - operartorChartName = "tidb-operator" - tidbClusterChartName = "tidb-cluster" - backupChartName = "tidb-backup" - statbilityTestTag = "stability" - metricsPort = 8090 + DefaultPollTimeout time.Duration = 10 * time.Minute + DefaultPollInterval time.Duration = 1 * time.Minute + BackupAndRestorePollTimeOut time.Duration = 30 * time.Minute + getBackupDirPodName = "get-backup-dir" + grafanaUsername = "admin" + grafanaPassword = "admin" + operartorChartName = "tidb-operator" + tidbClusterChartName = "tidb-cluster" + backupChartName = "tidb-backup" + statbilityTestTag = "stability" + metricsPort = 8090 ) type OperatorActions interface { @@ -119,12 +126,18 @@ type OperatorActions interface { GetNodeMap(info *TidbClusterConfig, component string) (map[string][]string, error) TruncateSSTFileThenCheckFailover(info *TidbClusterConfig, tikvFailoverPeriod time.Duration) error TruncateSSTFileThenCheckFailoverOrDie(info *TidbClusterConfig, tikvFailoverPeriod time.Duration) - CheckFailoverPending(info *TidbClusterConfig, faultPoint *time.Time) (bool, error) - CheckFailoverPendingOrDie(clusters []*TidbClusterConfig, faultPoint *time.Time) + CheckFailoverPending(info *TidbClusterConfig, node string, faultPoint *time.Time) (bool, error) + CheckFailoverPendingOrDie(clusters []*TidbClusterConfig, node string, faultPoint *time.Time) CheckFailover(info *TidbClusterConfig, faultNode string) (bool, error) CheckFailoverOrDie(clusters []*TidbClusterConfig, faultNode string) CheckRecover(cluster *TidbClusterConfig) (bool, error) CheckRecoverOrDie(clusters []*TidbClusterConfig) + CheckK8sAvailable(excludeNodes map[string]string, excludePods map[string]*corev1.Pod) error + CheckK8sAvailableOrDie(excludeNodes map[string]string, excludePods map[string]*corev1.Pod) + CheckOperatorAvailable(operatorConfig *OperatorConfig) error + CheckTidbClustersAvailable(infos []*TidbClusterConfig) error + CheckOneEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) + CheckOneApiserverDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) RegisterWebHookAndService(info *OperatorConfig) error RegisterWebHookAndServiceOrDie(info *OperatorConfig) CleanWebHookAndService(info *OperatorConfig) error @@ -418,7 +431,7 @@ 
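The enlarged constant block above adds `BackupAndRestorePollTimeOut` so the slower backup and restore checks can poll well past `DefaultPollTimeout`. Below is a minimal, self-contained sketch of the `wait.Poll` pattern those checks share; the two constants are restated locally and `jobDone` is a placeholder for the real job-status lookup, so everything other than the `wait.Poll` call itself is an assumption for illustration.

```go
package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

const (
	DefaultPollInterval         = 1 * time.Minute
	BackupAndRestorePollTimeOut = 30 * time.Minute
)

// jobDone is a stand-in for the real check, e.g. reading job.Status.Succeeded.
func jobDone() (bool, error) {
	return true, nil
}

func main() {
	// Poll the condition every DefaultPollInterval until it returns true,
	// returns an error, or BackupAndRestorePollTimeOut elapses.
	if err := wait.Poll(DefaultPollInterval, BackupAndRestorePollTimeOut, jobDone); err != nil {
		fmt.Printf("failed to launch backup job: %v\n", err)
	}
}
```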
func (oa *operatorActions) CleanTidbCluster(info *TidbClusterConfig) error { patchPVCmd := fmt.Sprintf("kubectl get pv | grep %s | grep %s | awk '{print $1}' | "+ "xargs -I {} kubectl patch pv {} -p '{\"spec\":{\"persistentVolumeReclaimPolicy\":\"Delete\"}}'", info.Namespace, info.ClusterName) - glog.Info(patchPVCmd) + glog.V(4).Info(patchPVCmd) if res, err := exec.Command("/bin/sh", "-c", patchPVCmd).CombinedOutput(); err != nil { return fmt.Errorf("failed to patch pv: %v, %s", err, string(res)) } @@ -431,13 +444,13 @@ func (oa *operatorActions) CleanTidbCluster(info *TidbClusterConfig) error { return false, nil } - pvCmd := fmt.Sprintf("kubectl get pv -l %s=%s,%s=%s 2>/dev/null|grep Released", - label.NamespaceLabelKey, info.Namespace, label.InstanceLabelKey, info.ClusterName) + pvCmd := fmt.Sprintf("kubectl get pv | grep %s | grep %s 2>/dev/null|grep Released", + info.Namespace, info.ClusterName) glog.V(4).Info(pvCmd) if res, err := exec.Command("/bin/sh", "-c", pvCmd). CombinedOutput(); len(res) == 0 { } else if err != nil { - glog.Infof("waiting for tidbcluster: %s/%s pv deleting, %v, %s", + glog.V(4).Infof("waiting for tidbcluster: %s/%s pv deleting, %v, %s", info.Namespace, info.ClusterName, err, string(res)) return false, nil } @@ -507,7 +520,7 @@ func (oa *operatorActions) CheckTidbClusterStatus(info *TidbClusterConfig) error } return true, nil }); err != nil { - glog.Infof("check tidb cluster status failed: %s", err.Error()) + glog.Errorf("check tidb cluster status failed: %s", err.Error()) return fmt.Errorf("failed to waiting for tidbcluster %s/%s ready in 30 minutes", ns, tcName) } @@ -1475,9 +1488,9 @@ func (oa *operatorActions) CheckAdHocBackup(info *TidbClusterConfig) error { return true, nil } - err := wait.Poll(oa.pollInterval, DefaultPollTimeout, fn) + err := wait.Poll(DefaultPollInterval, BackupAndRestorePollTimeOut, fn) if err != nil { - return fmt.Errorf("failed to launch scheduler backup job: %v", err) + return fmt.Errorf("failed to launch backup job: %v", err) } return nil @@ -1519,7 +1532,7 @@ func (oa *operatorActions) CheckRestore(from *TidbClusterConfig, to *TidbCluster return false, nil } if job.Status.Succeeded == 0 { - glog.Errorf("cluster [%s] back up job is not completed, please wait! ", to.ClusterName) + glog.Errorf("cluster [%s] restore job is not completed, please wait! 
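Both `CheckAdHocBackup` and `CheckRestore` above ultimately wait for a batch Job to report a succeeded pod. A standalone sketch of that lookup is below, written against the same context-free client-go calls the rest of these tests use; `backupJobDone` and its arguments are hypothetical names, not helpers from this patch.

```go
package tests

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// backupJobDone reports whether the named backup/restore Job has at least one
// succeeded pod, mirroring the job.Status.Succeeded check in CheckAdHocBackup
// and CheckRestore. The namespace and job name are illustrative only.
func backupJobDone(kubeCli kubernetes.Interface, ns, jobName string) (bool, error) {
	job, err := kubeCli.BatchV1().Jobs(ns).Get(jobName, metav1.GetOptions{})
	if err != nil {
		// Treat lookup errors as "not done yet" so the caller's wait.Poll retries.
		return false, nil
	}
	if job.Status.Succeeded == 0 {
		fmt.Printf("job %s/%s has not completed yet\n", ns, jobName)
		return false, nil
	}
	return true, nil
}
```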
", to.ClusterName) return false, nil } @@ -1543,9 +1556,9 @@ func (oa *operatorActions) CheckRestore(from *TidbClusterConfig, to *TidbCluster return true, nil } - err := wait.Poll(oa.pollInterval, 30*time.Minute, fn) + err := wait.Poll(oa.pollInterval, BackupAndRestorePollTimeOut, fn) if err != nil { - return fmt.Errorf("failed to launch scheduler backup job: %v", err) + return fmt.Errorf("failed to launch restore job: %v", err) } return nil } @@ -1727,7 +1740,7 @@ func (oa *operatorActions) CheckScheduledBackup(info *TidbClusterConfig) error { return false, nil } - err := wait.Poll(oa.pollInterval, DefaultPollTimeout, fn) + err := wait.Poll(DefaultPollInterval, BackupAndRestorePollTimeOut, fn) if err != nil { return fmt.Errorf("failed to launch scheduler backup job: %v", err) } diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index 633d80a1cc..c6d1880a27 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -21,11 +21,11 @@ import ( "github.com/golang/glog" "github.com/jinzhu/copier" - "github.com/pingcap/tidb-operator/tests/pkg/client" - "k8s.io/apiserver/pkg/util/logs" - "github.com/pingcap/tidb-operator/tests" "github.com/pingcap/tidb-operator/tests/backup" + "github.com/pingcap/tidb-operator/tests/pkg/client" + + "k8s.io/apiserver/pkg/util/logs" ) func main() { @@ -40,6 +40,7 @@ func main() { oa := tests.NewOperatorActions(cli, kubeCli, tests.DefaultPollInterval, conf) fta := tests.NewFaultTriggerAction(cli, kubeCli, conf) fta.CheckAndRecoverEnvOrDie() + oa.CheckK8sAvailableOrDie(nil, nil) tidbVersion := conf.GetTiDBVersionOrDie() upgardeTiDBVersions := conf.GetUpgradeTidbVersionsOrDie() @@ -210,7 +211,7 @@ func main() { // stop a node and failover automatically physicalNode, node, faultTime := fta.StopNodeOrDie() - oa.CheckFailoverPendingOrDie(allClusters, &faultTime) + oa.CheckFailoverPendingOrDie(allClusters, node, &faultTime) oa.CheckFailoverOrDie(allClusters, node) time.Sleep(3 * time.Minute) fta.StartNodeOrDie(physicalNode, node) @@ -222,6 +223,15 @@ func main() { // truncate a sst file and check failover oa.TruncateSSTFileThenCheckFailoverOrDie(cluster1, 5*time.Minute) + // stop one etcd node and k8s/operator/tidbcluster is available + faultEtcd := tests.SelectNode(conf.ETCDs) + fta.StopETCDOrDie(faultEtcd) + defer fta.StartETCDOrDie(faultEtcd) + // TODO make the pause interval as a argument + time.Sleep(3 * time.Minute) + oa.CheckOneEtcdDownOrDie(operatorCfg, allClusters, faultEtcd) + fta.StartETCDOrDie(faultEtcd) + //clean temp dirs when stability success err := conf.CleanTempDirs() if err != nil { diff --git a/tests/failover.go b/tests/failover.go index 8915d18519..35e4521195 100644 --- a/tests/failover.go +++ b/tests/failover.go @@ -1,6 +1,7 @@ package tests import ( + "database/sql" "fmt" "sort" "strings" @@ -69,6 +70,7 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailover(info *TidbClusterCon glog.Infof("truncate sst file target store: id=%s pod=%s", store.ID, store.PodName) } + oa.emitEvent(info, fmt.Sprintf("TruncateSSTFile: tikv: %s", store.PodName)) glog.Infof("deleting pod: [%s/%s] and wait 1 minute for the pod to terminate", info.Namespace, store.PodName) err = cli.CoreV1().Pods(info.Namespace).Delete(store.PodName, nil) if err != nil { @@ -88,11 +90,15 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailover(info *TidbClusterCon } oa.emitEvent(info, fmt.Sprintf("TruncateSSTFile: tikv: %s/%s", info.Namespace, store.PodName)) + // delete tikv pod glog.Infof("deleting pod: [%s/%s] again", info.Namespace, 
store.PodName) - err = cli.CoreV1().Pods(info.Namespace).Delete(store.PodName, nil) - if err != nil { - return err - } + wait.Poll(10*time.Second, time.Minute, func() (bool, error) { + err = oa.kubeCli.CoreV1().Pods(info.Namespace).Delete(store.PodName, &metav1.DeleteOptions{}) + if err != nil { + return false, nil + } + return true, nil + }) tikvOps.SetPoll(DefaultPollInterval, maxStoreDownTime+tikvFailoverPeriod+failoverTimeout) @@ -121,7 +127,12 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailoverOrDie(info *TidbClust } } -func (oa *operatorActions) CheckFailoverPending(info *TidbClusterConfig, faultPoint *time.Time) (bool, error) { +func (oa *operatorActions) CheckFailoverPending(info *TidbClusterConfig, node string, faultPoint *time.Time) (bool, error) { + affectedPods, err := oa.getPodsByNode(info, node) + if err != nil { + glog.Infof("cluster:[%s] query pods failed,error:%v", info.FullName(), err) + return false, nil + } tc, err := oa.cli.PingcapV1alpha1().TidbClusters(info.Namespace).Get(info.ClusterName, metav1.GetOptions{}) if err != nil { glog.Infof("pending failover,failed to get tidbcluster:[%s], error: %v", info.FullName(), err) @@ -135,19 +146,32 @@ func (oa *operatorActions) CheckFailoverPending(info *TidbClusterConfig, faultPo deadline := faultPoint.Add(period) if time.Now().Before(deadline) { if tc.Status.PD.FailureMembers != nil && len(tc.Status.PD.FailureMembers) > 0 { - err := fmt.Errorf("cluster: [%s] the pd member should be mark failure after %s", info.FullName(), deadline.Format(time.RFC3339)) - glog.Errorf(err.Error()) - return false, err + for _, failureMember := range tc.Status.PD.FailureMembers { + if _, exist := affectedPods[failureMember.PodName]; exist { + err := fmt.Errorf("cluster: [%s] the pd member[%s] should be mark failure after %s", info.FullName(), failureMember.PodName, deadline.Format(time.RFC3339)) + glog.Errorf(err.Error()) + return false, err + } + } } if tc.Status.TiKV.FailureStores != nil && len(tc.Status.TiKV.FailureStores) > 0 { - err := fmt.Errorf("cluster: [%s] the tikv store should be mark failure after %s", info.FullName(), deadline.Format(time.RFC3339)) - glog.Errorf(err.Error()) - return false, err + for _, failureStore := range tc.Status.TiKV.FailureStores { + if _, exist := affectedPods[failureStore.PodName]; exist { + err := fmt.Errorf("cluster: [%s] the tikv store[%s] should be mark failure after %s", info.FullName(), failureStore.PodName, deadline.Format(time.RFC3339)) + glog.Errorf(err.Error()) + return false, err + } + } + } if tc.Status.TiDB.FailureMembers != nil && len(tc.Status.TiDB.FailureMembers) > 0 { - err := fmt.Errorf("cluster: [%s] the tidb member should be mark failure after %s", info.FullName(), deadline.Format(time.RFC3339)) - glog.Errorf(err.Error()) - return false, err + for _, failureMember := range tc.Status.TiDB.FailureMembers { + if _, exist := affectedPods[failureMember.PodName]; exist { + err := fmt.Errorf("cluster: [%s] the tidb member[%s] should be mark failure after %s", info.FullName(), failureMember.PodName, deadline.Format(time.RFC3339)) + glog.Errorf(err.Error()) + return false, err + } + } } glog.Infof("cluster: [%s] operator's failover feature is pending", info.FullName()) @@ -156,11 +180,11 @@ func (oa *operatorActions) CheckFailoverPending(info *TidbClusterConfig, faultPo return true, nil } -func (oa *operatorActions) CheckFailoverPendingOrDie(clusters []*TidbClusterConfig, faultPoint *time.Time) { +func (oa *operatorActions) CheckFailoverPendingOrDie(clusters []*TidbClusterConfig, node 
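The rewritten `CheckFailoverPending` above now treats a failure member as premature only when its pod actually sits on the faulted node. That filtering step can be read in isolation as the small helper below; `prematureFailover` is a hypothetical name used purely for illustration.

```go
package tests

import corev1 "k8s.io/api/core/v1"

// prematureFailover returns the recorded failure pod names that belong to pods
// running on the faulted node. It mirrors the filtering CheckFailoverPending
// now applies to PD members, TiKV stores and TiDB members before the failover
// deadline; failures on other nodes are ignored.
func prematureFailover(failurePodNames []string, affectedPods map[string]*corev1.Pod) []string {
	var premature []string
	for _, name := range failurePodNames {
		if _, exist := affectedPods[name]; exist {
			premature = append(premature, name)
		}
	}
	return premature
}
```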
string, faultPoint *time.Time) { if err := wait.Poll(1*time.Minute, 30*time.Minute, func() (bool, error) { var passes []bool for i := range clusters { - pass, err := oa.CheckFailoverPending(clusters[i], faultPoint) + pass, err := oa.CheckFailoverPending(clusters[i], node, faultPoint) if err != nil { return pass, err } @@ -178,23 +202,12 @@ func (oa *operatorActions) CheckFailoverPendingOrDie(clusters []*TidbClusterConf } func (oa *operatorActions) CheckFailover(info *TidbClusterConfig, node string) (bool, error) { - selector, err := label.New().Instance(info.ClusterName).Selector() - if err != nil { - glog.Errorf("cluster:[%s] create selector failed, error:%v", info.FullName(), err) - return false, nil - } - pods, err := oa.kubeCli.CoreV1().Pods(info.Namespace).List(metav1.ListOptions{LabelSelector: selector.String()}) + affectedPods, err := oa.getPodsByNode(info, node) if err != nil { - glog.Errorf("cluster:[%s] query pods failed, error:%v", info.FullName(), err) + glog.Infof("cluster:[%s] query pods failed,error:%v", info.FullName(), err) return false, nil } - affectedPods := map[string]*corev1.Pod{} - for i, pod := range pods.Items { - if pod.Spec.NodeName == node { - affectedPods[pod.Name] = &pods.Items[i] - } - } if len(affectedPods) == 0 { glog.Infof("the cluster:[%s] can not be affected by node:[%s]", info.FullName(), node) return true, nil @@ -227,6 +240,27 @@ func (oa *operatorActions) CheckFailover(info *TidbClusterConfig, node string) ( return true, nil } +func (oa *operatorActions) getPodsByNode(info *TidbClusterConfig, node string) (map[string]*corev1.Pod, error) { + selector, err := label.New().Instance(info.ClusterName).Selector() + if err != nil { + glog.Errorf("cluster:[%s] create selector failed, error:%v", info.FullName(), err) + return nil, err + } + pods, err := oa.kubeCli.CoreV1().Pods(info.Namespace).List(metav1.ListOptions{LabelSelector: selector.String()}) + if err != nil { + glog.Errorf("cluster:[%s] query pods failed, error:%v", info.FullName(), err) + return nil, err + } + podsOfNode := map[string]*corev1.Pod{} + for i, pod := range pods.Items { + if pod.Spec.NodeName == node { + podsOfNode[pod.Name] = &pods.Items[i] + } + } + + return podsOfNode, nil +} + func (oa *operatorActions) CheckFailoverOrDie(clusters []*TidbClusterConfig, faultNode string) { if err := wait.Poll(1*time.Minute, 30*time.Minute, func() (bool, error) { var passes []bool @@ -413,3 +447,244 @@ func (oa *operatorActions) GetNodeMap(info *TidbClusterConfig, component string) return nodeMap, nil } + +func (oa *operatorActions) CheckOneEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) { + glog.Infof("check k8s/operator/tidbCluster status when one etcd down") + KeepOrDie(3*time.Second, 10*time.Minute, func() error { + err := oa.CheckK8sAvailable(nil, nil) + if err != nil { + return err + } + glog.V(4).Infof("k8s cluster is available.") + err = oa.CheckOperatorAvailable(operatorConfig) + if err != nil { + return err + } + glog.V(4).Infof("tidb operator is available.") + err = oa.CheckTidbClustersAvailable(clusters) + if err != nil { + return err + } + glog.V(4).Infof("all clusters is available") + return nil + }) +} + +func (oa *operatorActions) CheckOneApiserverDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) { + glog.Infof("check k8s/operator/tidbCluster status when one apiserver down") + affectedPods := map[string]*corev1.Pod{} + apiserverPod, err := GetApiserverPod(oa.kubeCli, faultNode) + if err != nil { + 
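For readability, here is a trimmed sketch of the shape shared by `CheckOneEtcdDownOrDie` and `CheckOneApiserverDownOrDie` above: keep probing k8s, the operator and the clusters for the whole fault window via `KeepOrDie` (defined later in tests/util.go). The method name `checkAllAvailable` is hypothetical; the three probe calls are the ones added in this patch.

```go
package tests

import (
	"time"

	"github.com/golang/glog"
)

// checkAllAvailable is a trimmed illustration of how the one-etcd-down and
// one-apiserver-down checks compose their probes: every few seconds for the
// whole period the k8s control plane, the operator deployments and the TiDB
// clusters must all still be healthy, otherwise KeepOrDie panics and fails
// the stability run.
func (oa *operatorActions) checkAllAvailable(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig) {
	KeepOrDie(3*time.Second, 10*time.Minute, func() error {
		if err := oa.CheckK8sAvailable(nil, nil); err != nil {
			return err
		}
		if err := oa.CheckOperatorAvailable(operatorConfig); err != nil {
			return err
		}
		if err := oa.CheckTidbClustersAvailable(clusters); err != nil {
			return err
		}
		glog.V(4).Info("k8s, operator and all clusters are available")
		return nil
	})
}
```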
panic(fmt.Errorf("can't find apiserver in node:%s", faultNode)) + } + if apiserverPod != nil { + affectedPods[apiserverPod.GetName()] = apiserverPod + } + controllerPod, err := GetControllerManagerPod(oa.kubeCli, faultNode) + if err != nil { + glog.Infof("can't find controllerManager in node:%s", faultNode) + } + if controllerPod != nil { + affectedPods[controllerPod.GetName()] = controllerPod + } + schedulerPod, err := GetSchedulerPod(oa.kubeCli, faultNode) + if err != nil { + glog.Infof("can't find scheduler in node:%s", faultNode) + } + if schedulerPod != nil { + affectedPods[schedulerPod.GetName()] = schedulerPod + } + dnsPod, err := GetDnsPod(oa.kubeCli, faultNode) + if err != nil { + panic(fmt.Errorf("can't find controller-manager in node:%s", faultNode)) + } + if dnsPod != nil { + affectedPods[dnsPod.GetName()] = dnsPod + } + KeepOrDie(3*time.Second, 10*time.Minute, func() error { + err := oa.CheckK8sAvailable(map[string]string{faultNode: faultNode}, affectedPods) + if err != nil { + return err + } + glog.V(4).Infof("k8s cluster is available.") + err = oa.CheckOperatorAvailable(operatorConfig) + if err != nil { + return err + } + glog.V(4).Infof("tidb operator is available.") + err = oa.CheckTidbClustersAvailable(clusters) + if err != nil { + return err + } + glog.V(4).Infof("all clusters is available") + return nil + }) +} + +func (oa *operatorActions) CheckK8sAvailableOrDie(excludeNodes map[string]string, excludePods map[string]*corev1.Pod) { + if err := oa.CheckK8sAvailable(excludeNodes, excludePods); err != nil { + panic(err) + } +} + +func (oa *operatorActions) CheckK8sAvailable(excludeNodes map[string]string, excludePods map[string]*corev1.Pod) error { + return wait.Poll(3*time.Second, time.Minute, func() (bool, error) { + nodes, err := oa.kubeCli.CoreV1().Nodes().List(metav1.ListOptions{}) + if err != nil { + glog.Errorf("failed to list nodes,error:%v", err) + return false, nil + } + for _, node := range nodes.Items { + if _, exist := excludeNodes[node.GetName()]; exist { + continue + } + for _, condition := range node.Status.Conditions { + if condition.Type == corev1.NodeReady && condition.Status != corev1.ConditionTrue { + return false, fmt.Errorf("node: [%s] is not in running", node.GetName()) + } + } + } + systemPods, err := oa.kubeCli.CoreV1().Pods("kube-system").List(metav1.ListOptions{}) + if err != nil { + glog.Errorf("failed to list kube-system pods,error:%v", err) + return false, nil + } + for _, pod := range systemPods.Items { + if _, exist := excludePods[pod.GetName()]; exist { + continue + } + podState := GetPodStatus(&pod) + if podState != string(corev1.PodRunning) { + return false, fmt.Errorf("pod:[%s/%s] is unavailable,state is %s", pod.GetName(), pod.GetNamespace(), podState) + } + } + return true, nil + }) +} + +func (oa *operatorActions) CheckOperatorAvailable(operatorConfig *OperatorConfig) error { + return wait.Poll(3*time.Second, 3*time.Minute, func() (bool, error) { + controllerDeployment, err := oa.kubeCli.AppsV1().Deployments(operatorConfig.Namespace).Get(tidbControllerName, metav1.GetOptions{}) + if err != nil { + glog.Errorf("failed to get deployment:%s failed,error:%v", tidbControllerName, err) + return false, nil + } + if controllerDeployment.Status.AvailableReplicas != *controllerDeployment.Spec.Replicas { + return false, fmt.Errorf("the %s is not available", tidbControllerName) + } + schedulerDeployment, err := oa.kubeCli.AppsV1().Deployments(operatorConfig.Namespace).Get(tidbSchedulerName, metav1.GetOptions{}) + if err != nil { + 
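`CheckK8sAvailable` above walks each node's conditions and each kube-system pod's status. The node half of that walk, pulled out as a standalone helper for clarity (the helper name is an assumption; the condition check mirrors the patch):

```go
package tests

import corev1 "k8s.io/api/core/v1"

// nodeIsReady reports whether a node's Ready condition is True. It is the
// same condition walk CheckK8sAvailable performs for every node that is not
// in the excludeNodes set.
func nodeIsReady(node corev1.Node) bool {
	for _, condition := range node.Status.Conditions {
		if condition.Type == corev1.NodeReady {
			return condition.Status == corev1.ConditionTrue
		}
	}
	// A node that does not report a Ready condition is treated as not ready.
	return false
}
```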
glog.Errorf("failed to get deployment:%s failed,error:%v", tidbSchedulerName, err) + return false, nil + } + if schedulerDeployment.Status.AvailableReplicas != *schedulerDeployment.Spec.Replicas { + return false, fmt.Errorf("the %s is not available", tidbSchedulerName) + } + return true, nil + }) +} + +func (oa *operatorActions) CheckTidbClustersAvailable(infos []*TidbClusterConfig) error { + return wait.Poll(3*time.Second, 30*time.Second, func() (bool, error) { + for _, info := range infos { + succ, err := oa.addDataToCluster(info) + if err != nil { + return false, err + } + if !succ { + return false, nil + } + } + return true, nil + }) + +} + +var testTableName = "testTable" + +func (op *operatorActions) addDataToCluster(info *TidbClusterConfig) (bool, error) { + db, err := sql.Open("mysql", getDSN(info.Namespace, info.ClusterName, "test", info.Password)) + if err != nil { + glog.Errorf("cluster:[%s] can't open connection to mysql: %v", info.FullName(), err) + return false, nil + } + defer db.Close() + + _, err = db.Exec(fmt.Sprintf("CREATE TABLE %s (name VARCHAR(64))", testTableName)) + if err != nil && !tableAlreadyExist(err) { + glog.Errorf("cluster:[%s] can't create table to mysql: %v", info.FullName(), err) + return false, nil + } + + _, err = db.Exec(fmt.Sprintf("INSERT INTO %s VALUES (?)", testTableName), "testValue") + if err != nil { + glog.Errorf("cluster:[%s] can't insert data to mysql: %v", info.FullName(), err) + return false, nil + } + + return true, nil +} + +func GetPodStatus(pod *corev1.Pod) string { + reason := string(pod.Status.Phase) + if pod.Status.Reason != "" { + reason = pod.Status.Reason + } + + initializing := false + for i := range pod.Status.InitContainerStatuses { + container := pod.Status.InitContainerStatuses[i] + switch { + case container.State.Terminated != nil && container.State.Terminated.ExitCode == 0: + continue + case container.State.Terminated != nil: + // initialization is failed + if len(container.State.Terminated.Reason) == 0 { + if container.State.Terminated.Signal != 0 { + reason = fmt.Sprintf("Init:Signal:%d", container.State.Terminated.Signal) + } else { + reason = fmt.Sprintf("Init:ExitCode:%d", container.State.Terminated.ExitCode) + } + } else { + reason = "Init:" + container.State.Terminated.Reason + } + initializing = true + case container.State.Waiting != nil && len(container.State.Waiting.Reason) > 0 && container.State.Waiting.Reason != "PodInitializing": + reason = "Init:" + container.State.Waiting.Reason + initializing = true + default: + reason = fmt.Sprintf("Init:%d/%d", i, len(pod.Spec.InitContainers)) + initializing = true + } + break + } + if !initializing { + for i := len(pod.Status.ContainerStatuses) - 1; i >= 0; i-- { + container := pod.Status.ContainerStatuses[i] + + if container.State.Waiting != nil && container.State.Waiting.Reason != "" { + reason = container.State.Waiting.Reason + } else if container.State.Terminated != nil && container.State.Terminated.Reason != "" { + reason = container.State.Terminated.Reason + } else if container.State.Terminated != nil && container.State.Terminated.Reason == "" { + if container.State.Terminated.Signal != 0 { + reason = fmt.Sprintf("Signal:%d", container.State.Terminated.Signal) + } else { + reason = fmt.Sprintf("ExitCode:%d", container.State.Terminated.ExitCode) + } + } + } + } + + if pod.DeletionTimestamp != nil && pod.Status.Reason == NodeUnreachablePodReason { + reason = "Unknown" + } else if pod.DeletionTimestamp != nil { + reason = "Terminating" + } + + return reason +} + +func 
tableAlreadyExist(err error) bool { + return strings.Contains(err.Error(), "already exists") +} diff --git a/tests/fault.go b/tests/fault.go index df6089fbbc..0a98a55e38 100644 --- a/tests/fault.go +++ b/tests/fault.go @@ -30,11 +30,15 @@ type FaultTriggerActions interface { StartNode(physicalNode string, node string) error StartNodeOrDie(physicalNode string, node string) StopETCD(nodes ...string) error + StopETCDOrDie(nodes ...string) StartETCD(nodes ...string) error + StartETCDOrDie(nodes ...string) StopKubelet(node string) error StartKubelet(node string) error StopKubeAPIServer(node string) error + StopKubeAPIServerOrDie(node string) StartKubeAPIServer(node string) error + StartKubeAPIServerOrDie(node string) StopKubeControllerManager(node string) error StartKubeControllerManager(node string) error StopKubeScheduler(node string) error @@ -178,7 +182,7 @@ func (fa *faultTriggerActions) StartNode(physicalNode string, node string) error return err } - glog.Infof("node %s on physical node %s is started", physicalNode, node) + glog.Infof("node %s on physical node %s is started", node, physicalNode) return nil } @@ -207,6 +211,12 @@ func (fa *faultTriggerActions) StopETCD(nodes ...string) error { return nil } +func (fa *faultTriggerActions) StopETCDOrDie(nodes ...string) { + if err := fa.StopETCD(nodes...); err != nil { + panic(err) + } +} + // StartETCD starts the etcd service. // If the `nodes` is empty, StartETCD will start all etcd service. func (fa *faultTriggerActions) StartETCD(nodes ...string) error { @@ -225,6 +235,12 @@ func (fa *faultTriggerActions) StartETCD(nodes ...string) error { return nil } +func (fa *faultTriggerActions) StartETCDOrDie(nodes ...string) { + if err := fa.StartETCD(nodes...); err != nil { + panic(err) + } +} + // StopKubelet stops the kubelet service. func (fa *faultTriggerActions) StopKubelet(node string) error { return fa.serviceAction(node, manager.KubeletService, stopAction) @@ -270,11 +286,23 @@ func (fa *faultTriggerActions) StopKubeAPIServer(node string) error { return fa.serviceAction(node, manager.KubeAPIServerService, stopAction) } +func (fa *faultTriggerActions) StopKubeAPIServerOrDie(node string) { + if err := fa.StopKubeAPIServer(node); err != nil { + panic(err) + } +} + // StartKubeAPIServer starts the apiserver service. 
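The new `StopETCDOrDie`, `StartETCDOrDie` and `Stop/StartKubeAPIServerOrDie` wrappers above all follow the same convention: a failed fault injection is fatal to the stability run. That convention boils down to a sketch like this; the generic `orDie` helper does not exist in the patch and is shown only to make the pattern explicit.

```go
package tests

// orDie converts an error-returning fault-trigger call into a fatal one, the
// same convention behind StopETCDOrDie, StartETCDOrDie and the new
// Stop/StartKubeAPIServerOrDie wrappers.
//
// Example (hypothetical): orDie(func() error { return fa.StopETCD(faultEtcd) })
func orDie(action func() error) {
	if err := action(); err != nil {
		panic(err)
	}
}
```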
func (fa *faultTriggerActions) StartKubeAPIServer(node string) error { return fa.serviceAction(node, manager.KubeAPIServerService, startAction) } +func (fa *faultTriggerActions) StartKubeAPIServerOrDie(node string) { + if err := fa.StartKubeAPIServer(node); err != nil { + panic(err) + } +} + func (fa *faultTriggerActions) serviceAction(node string, serverName string, action string) error { faultCli := client.NewClient(client.Config{ Addr: fa.genFaultTriggerAddr(node), @@ -323,7 +351,7 @@ func (fa *faultTriggerActions) serviceAction(node string, serverName string, act return err } - glog.V(4).Infof("%s %s %s successfully", action, serverName, node) + glog.Infof("%s %s %s successfully", action, serverName, node) return nil } diff --git a/tests/pkg/blockwriter/blockwriter.go b/tests/pkg/blockwriter/blockwriter.go index 265823496c..6e6fb5af37 100644 --- a/tests/pkg/blockwriter/blockwriter.go +++ b/tests/pkg/blockwriter/blockwriter.go @@ -123,13 +123,11 @@ func (c *BlockWriterCase) generateQuery(ctx context.Context, queryChan chan []st select { case <-ctx.Done(): return + case queryChan <- querys: + continue default: - if len(queryChan) < queryChanSize { - queryChan <- querys - } else { - glog.V(4).Infof("[%s] [%s] [action: generate Query] query channel is full, sleep 10 seconds", c, c.ClusterName) - util.Sleep(ctx, 10*time.Second) - } + glog.V(4).Infof("[%s] [%s] [action: generate Query] query channel is full, sleep 10 seconds", c, c.ClusterName) + util.Sleep(ctx, 10*time.Second) } } } diff --git a/tests/pkg/webhook/pods.go b/tests/pkg/webhook/pods.go index 905d628ce3..21c6cb56b7 100644 --- a/tests/pkg/webhook/pods.go +++ b/tests/pkg/webhook/pods.go @@ -84,7 +84,7 @@ func admitPods(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse { return &reviewResponse } - glog.Infof("delete pod %s", pod.Labels[label.ComponentLabelKey]) + glog.Infof("delete %s pod [%s]", pod.Labels[label.ComponentLabelKey], pod.GetName()) tc, err := versionCli.PingcapV1alpha1().TidbClusters(namespace).Get(pod.Labels[label.InstanceLabelKey], metav1.GetOptions{}) if err != nil { @@ -95,15 +95,14 @@ func admitPods(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse { pdClient := controller.NewDefaultPDControl().GetPDClient(tc) tidbController := controller.NewDefaultTiDBControl() - if pod.Labels[label.ComponentLabelKey] == "tidb" { - - // if tidb pod is deleting, allow pod delete operation - if pod.DeletionTimestamp != nil { - glog.Infof("TIDB pod status is namespace %s name %s timestamp %s", namespace, name, pod.DeletionTimestamp) - reviewResponse.Allowed = true - return &reviewResponse - } + // if the pod is deleting, allow the pod delete operation + if pod.DeletionTimestamp != nil { + glog.Infof("pod:[%s/%s] status is timestamp %s", namespace, name, pod.DeletionTimestamp) + reviewResponse.Allowed = true + return &reviewResponse + } + if pod.Labels[label.ComponentLabelKey] == "tidb" { ordinal, err := strconv.ParseInt(strings.Split(name, "-")[len(strings.Split(name, "-"))-1], 10, 32) if err != nil { glog.Errorf("fail to convert string to int while deleting TiDB err %v", err) @@ -125,7 +124,6 @@ func admitPods(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse { } } else if pod.Labels[label.ComponentLabelKey] == "pd" { - leader, err := pdClient.GetPDLeader() if err != nil { glog.Errorf("fail to get pd leader %v", err) @@ -141,7 +139,6 @@ func admitPods(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse { } } else if pod.Labels[label.ComponentLabelKey] == "tikv" { - var storeID uint64 storeID = 0 for _, store := 
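The `generateQuery` change above replaces a manual `len(queryChan)` check with a `select` that first tries to send and only falls back to a logged sleep when the buffered channel is full. A runnable toy version of that send-or-back-off shape (names and durations are illustrative):

```go
package main

import (
	"fmt"
	"time"
)

// sendOrWait tries to hand a query batch to the consumer; when the buffered
// channel is full it backs off instead of blocking forever, which is the
// select/default shape generateQuery switches to above.
func sendOrWait(queryChan chan []string, querys []string, done <-chan struct{}) bool {
	select {
	case <-done:
		return false
	case queryChan <- querys:
		return true
	default:
		fmt.Println("query channel is full, sleeping")
		time.Sleep(10 * time.Millisecond)
		return true
	}
}

func main() {
	done := make(chan struct{})
	ch := make(chan []string, 1)
	sendOrWait(ch, []string{"SELECT 1"}, done) // fills the one-slot buffer
	sendOrWait(ch, []string{"SELECT 2"}, done) // falls through to the default branch
	close(done)
}
```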
range tc.Status.TiKV.Stores {
diff --git a/tests/util.go b/tests/util.go
new file mode 100644
index 0000000000..2531ea00d8
--- /dev/null
+++ b/tests/util.go
@@ -0,0 +1,78 @@
+// Copyright 2018 PingCAP, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tests
+
+import (
+	"math/rand"
+	"time"
+
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/client-go/kubernetes"
+)
+
+// KeepOrDie runs fun once per interval for the whole period and panics as soon as fun returns an error
+func KeepOrDie(interval time.Duration, period time.Duration, fun func() error) {
+	timeline := time.Now().Add(period)
+	for {
+		if time.Now().After(timeline) {
+			break
+		}
+		err := fun()
+		if err != nil {
+			panic(err)
+		}
+		time.Sleep(interval)
+	}
+}
+
+func SelectNode(nodes []Nodes) string {
+	rand.Seed(time.Now().Unix())
+	index := rand.Intn(len(nodes))
+	vmNodes := nodes[index].Nodes
+	index2 := rand.Intn(len(vmNodes))
+	return vmNodes[index2]
+}
+
+func GetApiserverPod(kubeCli kubernetes.Interface, node string) (*corev1.Pod, error) {
+	return GetKubeComponent(kubeCli, node, "kube-apiserver")
+}
+
+func GetSchedulerPod(kubeCli kubernetes.Interface, node string) (*corev1.Pod, error) {
+	return GetKubeComponent(kubeCli, node, "kube-scheduler")
+}
+
+func GetDnsPod(kubeCli kubernetes.Interface, node string) (*corev1.Pod, error) {
+	return GetKubeComponent(kubeCli, node, "kube-dns")
+}
+
+func GetControllerManagerPod(kubeCli kubernetes.Interface, node string) (*corev1.Pod, error) {
+	return GetKubeComponent(kubeCli, node, "kube-controller-manager")
+}
+
+func GetKubeComponent(kubeCli kubernetes.Interface, node string, componentName string) (*corev1.Pod, error) {
+	selector := labels.Set(map[string]string{"component": componentName}).AsSelector()
+	options := metav1.ListOptions{LabelSelector: selector.String()}
+	componentPods, err := kubeCli.CoreV1().Pods("kube-system").List(options)
+	if err != nil {
+		return nil, err
+	}
+	for _, componentPod := range componentPods.Items {
+		if componentPod.Spec.NodeName == node {
+			return &componentPod, nil
+		}
+	}
+	return nil, nil
+}
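A short usage sketch tying the helpers above to the stability flow in tests/cmd/stability/main.go: pick a fault node from the configured etcd nodes, locate the apiserver pod scheduled there, then require a probe to keep passing for the fault window. `exerciseHelpers` and the empty probe body are placeholders; `Nodes`, `SelectNode`, `GetApiserverPod` and `KeepOrDie` come from this patch.

```go
package tests

import (
	"fmt"
	"time"

	"k8s.io/client-go/kubernetes"
)

// exerciseHelpers shows how the stability scenarios combine the new helpers.
func exerciseHelpers(kubeCli kubernetes.Interface, etcds []Nodes) error {
	faultNode := SelectNode(etcds)

	apiserverPod, err := GetApiserverPod(kubeCli, faultNode)
	if err != nil || apiserverPod == nil {
		return fmt.Errorf("can't find apiserver on node %s: %v", faultNode, err)
	}

	KeepOrDie(3*time.Second, 10*time.Minute, func() error {
		// Real callers plug in CheckK8sAvailable / CheckOperatorAvailable /
		// CheckTidbClustersAvailable here.
		return nil
	})
	return nil
}
```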