add etcd and kube-apiserver faults #367

Merged · 24 commits · Apr 30, 2019

Changes from 11 commits
12 changes: 12 additions & 0 deletions tests/actions.go
@@ -49,6 +49,13 @@ import (

const (
period = 5 * time.Minute

tidbControllerName string = "tidb-controller-manager"
tidbSchedulerName string = "tidb-scheduler"

// NodeUnreachablePodReason is defined in k8s.io/kubernetes/pkg/util/node
// but not in client-go and apimachinery, so we define it here
NodeUnreachablePodReason = "NodeLost"
)

func NewOperatorActions(cli versioned.Interface, kubeCli kubernetes.Interface, cfg *Config) OperatorActions {
@@ -111,6 +118,11 @@ type OperatorActions interface {
CheckFailoverOrDie(clusters []*TidbClusterConfig, faultNode string)
CheckRecover(cluster *TidbClusterConfig) (bool, error)
CheckRecoverOrDie(clusters []*TidbClusterConfig)
CheckK8sAvailable(excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) error
CheckOperatorAvailable(operatorConfig *OperatorConfig) error
CheckTidbClustersAvailable(infos []*TidbClusterConfig) error
CheckOneEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string)
CheckOneApiserverDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string)
}

type operatorActions struct {
23 changes: 20 additions & 3 deletions tests/cmd/stability/main.go
@@ -19,13 +19,15 @@ import (
_ "net/http/pprof"
"time"

"github.com/pingcap/tidb-operator/tests/backup"

"github.com/golang/glog"
"github.com/jinzhu/copier"
"github.com/pingcap/tidb-operator/tests/pkg/client"
"k8s.io/apiserver/pkg/util/logs"

"github.com/pingcap/tidb-operator/tests"
"github.com/pingcap/tidb-operator/tests/backup"
"github.com/pingcap/tidb-operator/tests/pkg/client"

"k8s.io/apiserver/pkg/util/logs"
)

func main() {
@@ -40,6 +42,7 @@ func main() {
oa := tests.NewOperatorActions(cli, kubeCli, conf)
fta := tests.NewFaultTriggerAction(cli, kubeCli, conf)
fta.CheckAndRecoverEnvOrDie()
oa.CheckK8sAvailable(nil, nil)
Contributor: This function returns an error; it should be checked.

tidbVersion := conf.GetTiDBVersionOrDie()
upgardeTiDBVersions := conf.GetUpgradeTidbVersionsOrDie()
@@ -219,5 +222,19 @@ func main() {
// truncate a sst file and check failover
oa.TruncateSSTFileThenCheckFailoverOrDie(cluster1, 5*time.Minute)

// stop one etcd node and k8s/operator/tidbcluster is available
faultEtcd := tests.SelectNode(conf.ETCDs)
Contributor: this SelectNode method will always select the first etcd.

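If randomized selection is wanted, a possible sketch (the Nodes shape is assumed from how conf.ETCDs is used here, not confirmed by this diff):

```go
import (
	"math/rand"
	"time"
)

// Nodes is assumed to group virtual nodes under one physical node (assumption).
type Nodes struct {
	PhysicalNode string
	Nodes        []string
}

// SelectNode picks a random etcd node instead of always the first one, so
// repeated stability runs exercise different fault targets.
func SelectNode(nodes []Nodes) string {
	rand.Seed(time.Now().UnixNano())
	group := nodes[rand.Intn(len(nodes))]
	return group.Nodes[rand.Intn(len(group.Nodes))]
}
```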
fta.StopETCDOrDie(faultEtcd)
defer fta.StartETCDOrDie(faultEtcd)
Contributor: before CheckOneEtcdDownOrDie you should sleep for about 30 minutes.

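If that suggestion were adopted, the wait would slot in just before the check below (a sketch; the 30-minute figure is the reviewer's suggestion, not a verified requirement):

```go
// Let the cluster settle into a steady state with one etcd down before
// asserting continuous availability (duration per the reviewer's suggestion).
time.Sleep(30 * time.Minute)
```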
oa.CheckOneEtcdDownOrDie(operatorCfg, allClusters, faultEtcd)
Contributor: You should add other cases: stopping 2 etcds and stopping 3 etcds.

Author: Sure, I will add these cases in a follow-up PR.

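A rough sketch of what those follow-up cases could look like (SelectNodes and CheckEtcdDownOrDie are hypothetical helpers; StopETCDOrDie/StartETCDOrDie are variadic per tests/fault.go below):

```go
// Hypothetical follow-up cases: stop 2, then 3, etcd members at once.
for _, n := range []int{2, 3} {
	victims := tests.SelectNodes(conf.ETCDs, n) // hypothetical multi-node selector
	fta.StopETCDOrDie(victims...)
	// With 2 of 3 members down etcd loses quorum, so the availability
	// expectations would have to differ from the single-fault check.
	oa.CheckEtcdDownOrDie(operatorCfg, allClusters, victims...) // hypothetical check
	fta.StartETCDOrDie(victims...)
}
```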
fta.StartETCDOrDie(faultEtcd)

// stop one apiserver node and k8s/operator/tidbcluster is available
faultApiserver := tests.SelectNode(conf.APIServers)
fta.StopKubeAPIServerOrDie(faultApiserver)
defer fta.StartKubeAPIServerOrDie(faultApiserver)
oa.CheckOneApiserverDownOrDie(operatorCfg, allClusters, faultApiserver)
fta.StartKubeAPIServerOrDie(faultApiserver)

glog.Infof("\nFinished.")
}
236 changes: 236 additions & 0 deletions tests/failover.go
@@ -1,6 +1,7 @@
package tests

import (
"database/sql"
"fmt"
"sort"
"strings"
@@ -445,3 +446,238 @@ func (oa *operatorActions) GetNodeMap(info *TidbClusterConfig, component string)

return nodeMap, nil
}

func (oa *operatorActions) CheckOneEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) {
KeepOrDie(3*time.Second, 10*time.Minute, func() error {
err := oa.CheckK8sAvailable(nil, nil)
if err != nil {
return err
}
glog.Infof("k8s cluster is available.")
err = oa.CheckOperatorAvailable(operatorConfig)
if err != nil {
return err
}
glog.Infof("tidb operator is available.")
err = oa.CheckTidbClustersAvailable(clusters)
if err != nil {
return err
}
glog.Infof("all clusters is available")
return nil
})
}
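KeepOrDie is referenced here but is not part of this diff; a minimal sketch of its apparent contract (the condition must keep passing for the whole window, panicking on the first failure); this is an assumption, not the repository's actual implementation:

```go
import "time"

// KeepOrDie runs fn every interval until duration elapses and panics on the
// first error: the checked condition must hold continuously, not just once.
func KeepOrDie(interval, duration time.Duration, fn func() error) {
	for deadline := time.Now().Add(duration); time.Now().Before(deadline); {
		if err := fn(); err != nil {
			panic(err)
		}
		time.Sleep(interval)
	}
}
```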

func (oa *operatorActions) CheckOneApiserverDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) {
affectedPods := map[string]*corev1.Pod{}
apiserverPod, err := GetApiserverPod(oa.kubeCli, faultNode)
if err != nil {
panic(fmt.Errorf("can't find apiserver in node:%s", faultNode))
}
if apiserverPod != nil {
affectedPods[apiserverPod.GetName()] = apiserverPod
}
controllerPod, err := GetControllerManagerPod(oa.kubeCli, faultNode)
if err != nil {
glog.Infof("can't find controllerManager in node:%s", faultNode)
}
if controllerPod != nil {
affectedPods[controllerPod.GetName()] = controllerPod
}
schedulerPod, err := GetSchedulerPod(oa.kubeCli, faultNode)
if err != nil {
glog.Infof("can't find scheduler in node:%s", faultNode)
}
if schedulerPod != nil {
affectedPods[schedulerPod.GetName()] = schedulerPod
}
dnsPod, err := GetDnsPod(oa.kubeCli, faultNode)
if err != nil {
panic(fmt.Errorf("can't find controller-manager in node:%s", faultNode))
}
if dnsPod != nil {
affectedPods[dnsPod.GetName()] = dnsPod
}
KeepOrDie(3*time.Second, 10*time.Minute, func() error {
err := oa.CheckK8sAvailable(nil, affectedPods)
if err != nil {
return err
}
glog.Infof("k8s cluster is available.")
err = oa.CheckOperatorAvailable(operatorConfig)
if err != nil {
return err
}
glog.Infof("tidb operator is available.")
err = oa.CheckTidbClustersAvailable(clusters)
if err != nil {
return err
}
glog.Infof("all clusters is available")
return nil
})
}

func (oa *operatorActions) CheckK8sAvailable(excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) error {
return wait.Poll(3*time.Second, time.Minute, func() (bool, error) {
Contributor: use the default interval

Author: the default interval is too long for this case

nodes, err := oa.kubeCli.CoreV1().Nodes().List(metav1.ListOptions{})
if err != nil {
glog.Errorf("failed to list nodes,error:%v", err)
return false, nil
}
for _, node := range nodes.Items {
if _, exist := excludeNodes[node.GetName()]; exist {
continue
}
for _, condition := range node.Status.Conditions {
if condition.Type == corev1.NodeReady && condition.Status != corev1.ConditionTrue {
glog.Errorf("node: [%s] is not in running", node.GetName())
return false, nil
}
}
}
systemPods, err := oa.kubeCli.CoreV1().Pods("kube-system").List(metav1.ListOptions{})
if err != nil {
glog.Errorf("failed to list kube-system pods,error:%v", err)
return false, nil
}
for _, pod := range systemPods.Items {
if _, exist := excludePods[pod.GetName()]; exist {
continue
}
podState := GetPodStatus(&pod)
if podState != string(corev1.PodRunning) {
glog.Errorf("pod:[%s/%s] is unavailable,state is %s", pod.GetName(), pod.GetNamespace(), podState)
return false, nil
}
}
return true, nil
})
}

func (oa *operatorActions) CheckOperatorAvailable(operatorConfig *OperatorConfig) error {
return wait.Poll(3*time.Second, 3*time.Minute, func() (bool, error) {
Contributor: use the default interval

controllerDeployment, err := oa.kubeCli.AppsV1().Deployments(operatorConfig.Namespace).Get(tidbControllerName, metav1.GetOptions{})
if err != nil {
glog.Errorf("failed to get deployment:%s failed,error:%v", tidbControllerName, err)
return false, nil
}
if controllerDeployment.Status.AvailableReplicas != *controllerDeployment.Spec.Replicas {
return false, fmt.Errorf("the %s is not available", tidbControllerName)
}
schedulerDeployment, err := oa.kubeCli.AppsV1().Deployments(operatorConfig.Namespace).Get(tidbSchedulerName, metav1.GetOptions{})
if err != nil {
glog.Errorf("failed to get deployment:%s failed,error:%v", tidbSchedulerName, err)
return false, nil
}
if schedulerDeployment.Status.AvailableReplicas != *schedulerDeployment.Spec.Replicas {
return false, fmt.Errorf("the %s is not available", tidbSchedulerName)
}
return true, nil
})
}

func (oa *operatorActions) CheckTidbClustersAvailable(infos []*TidbClusterConfig) error {
return wait.Poll(3*time.Second, 30*time.Second, func() (bool, error) {
Contributor: use the default interval

for _, info := range infos {
succ, err := oa.addDataToCluster(info)
if err != nil {
return false, err
}
if !succ {
return false, nil
}
}
return true, nil
})
}

var testTableName = "testTable"

func (oa *operatorActions) addDataToCluster(info *TidbClusterConfig) (bool, error) {
db, err := sql.Open("mysql", getDSN(info.Namespace, info.ClusterName, "test", info.Password))
if err != nil {
glog.Infof("cluster:[%s] can't open connection to mysql: %v", info.FullName(), err)
return false, nil
}
defer db.Close()

_, err = db.Exec(fmt.Sprintf("CREATE TABLE %s (name VARCHAR(64))", testTableName))
if err != nil && !tableAlreadyExist(err) {
glog.Infof("cluster:[%s] can't create table to mysql: %v", info.FullName(), err)
return false, nil
}

_, err = db.Exec(fmt.Sprintf("INSERT INTO %s VALUES (?)", testTableName), "testValue")
if err != nil {
glog.Infof("cluster:[%s] can't insert data to mysql: %v", info.FullName(), err)
return false, nil
}

return true, nil
}

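// GetPodStatus computes the STATUS string shown by `kubectl get pods`; the
// logic appears adapted from the pod printer in k8s.io/kubernetes (editor's
// observation, not confirmed in this PR). Unlike .status.phase alone, it
// surfaces init- and container-level reasons such as CrashLoopBackOff.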
func GetPodStatus(pod *corev1.Pod) string {
Contributor: Why use such a complicated function? Isn't .status.phase enough?

Author: .status.phase is just the pod phase, not the pod's real state.

Contributor: Where does this code come from?

reason := string(pod.Status.Phase)
if pod.Status.Reason != "" {
reason = pod.Status.Reason
}

initializing := false
for i := range pod.Status.InitContainerStatuses {
container := pod.Status.InitContainerStatuses[i]
switch {
case container.State.Terminated != nil && container.State.Terminated.ExitCode == 0:
continue
case container.State.Terminated != nil:
// initialization failed
if len(container.State.Terminated.Reason) == 0 {
if container.State.Terminated.Signal != 0 {
reason = fmt.Sprintf("Init:Signal:%d", container.State.Terminated.Signal)
} else {
reason = fmt.Sprintf("Init:ExitCode:%d", container.State.Terminated.ExitCode)
}
} else {
reason = "Init:" + container.State.Terminated.Reason
}
initializing = true
case container.State.Waiting != nil && len(container.State.Waiting.Reason) > 0 && container.State.Waiting.Reason != "PodInitializing":
reason = "Init:" + container.State.Waiting.Reason
initializing = true
default:
reason = fmt.Sprintf("Init:%d/%d", i, len(pod.Spec.InitContainers))
initializing = true
}
break
}
if !initializing {
for i := len(pod.Status.ContainerStatuses) - 1; i >= 0; i-- {
container := pod.Status.ContainerStatuses[i]

if container.State.Waiting != nil && container.State.Waiting.Reason != "" {
reason = container.State.Waiting.Reason
} else if container.State.Terminated != nil && container.State.Terminated.Reason != "" {
reason = container.State.Terminated.Reason
} else if container.State.Terminated != nil && container.State.Terminated.Reason == "" {
if container.State.Terminated.Signal != 0 {
reason = fmt.Sprintf("Signal:%d", container.State.Terminated.Signal)
} else {
reason = fmt.Sprintf("ExitCode:%d", container.State.Terminated.ExitCode)
}
}
}
}

if pod.DeletionTimestamp != nil && pod.Status.Reason == NodeUnreachablePodReason {
reason = "Unknown"
} else if pod.DeletionTimestamp != nil {
reason = "Terminating"
}

return reason
}

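To illustrate the author's point above: a pod can report phase Running while a container is crash-looping, and GetPodStatus surfaces the container-level reason. A sketch, assuming it runs in the same package as GetPodStatus (needs "fmt" and corev1 "k8s.io/api/core/v1"):

```go
func ExampleGetPodStatus() {
	pod := &corev1.Pod{
		Status: corev1.PodStatus{
			Phase: corev1.PodRunning, // the phase alone still says "Running"
			ContainerStatuses: []corev1.ContainerStatus{{
				State: corev1.ContainerState{
					Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"},
				},
			}},
		},
	}
	fmt.Println(string(pod.Status.Phase)) // Running
	fmt.Println(GetPodStatus(pod))        // CrashLoopBackOff
}
```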
func tableAlreadyExist(err error) bool {
return strings.Contains(err.Error(), "already exists")
}
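Matching on the error text is fragile across server versions and locales; if the go-sql-driver/mysql driver is in use (an assumption), the server error code can be checked instead (1050 is ER_TABLE_EXISTS_ERROR):

```go
import "github.com/go-sql-driver/mysql"

// tableAlreadyExist reports whether err is MySQL error 1050
// (ER_TABLE_EXISTS_ERROR) rather than matching on the message text.
func tableAlreadyExist(err error) bool {
	if myErr, ok := err.(*mysql.MySQLError); ok {
		return myErr.Number == 1050
	}
	return false
}
```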
28 changes: 28 additions & 0 deletions tests/fault.go
@@ -30,11 +30,15 @@ type FaultTriggerActions interface {
StartNode(physicalNode string, node string) error
StartNodeOrDie(physicalNode string, node string)
StopETCD(nodes ...string) error
StopETCDOrDie(nodes ...string)
StartETCD(nodes ...string) error
StartETCDOrDie(nodes ...string)
StopKubelet(node string) error
StartKubelet(node string) error
StopKubeAPIServer(node string) error
StopKubeAPIServerOrDie(node string)
StartKubeAPIServer(node string) error
StartKubeAPIServerOrDie(node string)
StopKubeControllerManager(node string) error
StartKubeControllerManager(node string) error
StopKubeScheduler(node string) error
@@ -209,6 +213,12 @@ func (fa *faultTriggerActions) StopETCD(nodes ...string) error {
return nil
}

func (fa *faultTriggerActions) StopETCDOrDie(nodes ...string) {
if err := fa.StopETCD(nodes...); err != nil {
panic(err)
}
}

// StartETCD starts the etcd service.
// If the `nodes` is empty, StartETCD will start all etcd service.
func (fa *faultTriggerActions) StartETCD(nodes ...string) error {
@@ -227,6 +237,12 @@ func (fa *faultTriggerActions) StartETCD(nodes ...string) error {
return nil
}

func (fa *faultTriggerActions) StartETCDOrDie(nodes ...string) {
if err := fa.StartETCD(nodes...); err != nil {
panic(err)
}
}

// StopKubelet stops the kubelet service.
func (fa *faultTriggerActions) StopKubelet(node string) error {
return fa.serviceAction(node, manager.KubeletService, stopAction)
@@ -272,11 +288,23 @@ func (fa *faultTriggerActions) StopKubeAPIServer(node string) error {
return fa.serviceAction(node, manager.KubeAPIServerService, stopAction)
}

func (fa *faultTriggerActions) StopKubeAPIServerOrDie(node string) {
if err := fa.StopKubeAPIServer(node); err != nil {
panic(err)
}
}

// StartKubeAPIServer starts the apiserver service.
func (fa *faultTriggerActions) StartKubeAPIServer(node string) error {
return fa.serviceAction(node, manager.KubeAPIServerService, startAction)
}

func (fa *faultTriggerActions) StartKubeAPIServerOrDie(node string) {
if err := fa.StartKubeAPIServer(node); err != nil {
panic(err)
}
}

func (fa *faultTriggerActions) serviceAction(node string, serverName string, action string) error {
faultCli := client.NewClient(client.Config{
Addr: fa.genFaultTriggerAddr(node),