add etcd and kube-apiserver faults #367
Changes from 11 commits
```go
@@ -19,13 +19,15 @@ import (
	_ "net/http/pprof"
	"time"

	"github.com/golang/glog"
	"github.com/jinzhu/copier"

	"github.com/pingcap/tidb-operator/tests"
	"github.com/pingcap/tidb-operator/tests/backup"
	"github.com/pingcap/tidb-operator/tests/pkg/client"

	"k8s.io/apiserver/pkg/util/logs"
)

func main() {
```
```go
@@ -40,6 +42,7 @@ func main() {
	oa := tests.NewOperatorActions(cli, kubeCli, conf)
	fta := tests.NewFaultTriggerAction(cli, kubeCli, conf)
	fta.CheckAndRecoverEnvOrDie()
	oa.CheckK8sAvailable(nil, nil)

	tidbVersion := conf.GetTiDBVersionOrDie()
	upgardeTiDBVersions := conf.GetUpgradeTidbVersionsOrDie()
```
```go
@@ -219,5 +222,19 @@ func main() {
	// truncate a sst file and check failover
	oa.TruncateSSTFileThenCheckFailoverOrDie(cluster1, 5*time.Minute)

	// stop one etcd node and check that k8s/operator/tidbcluster stay available
	faultEtcd := tests.SelectNode(conf.ETCDs)
	fta.StopETCDOrDie(faultEtcd)
	defer fta.StartETCDOrDie(faultEtcd)
	oa.CheckOneEtcdDownOrDie(operatorCfg, allClusters, faultEtcd)
	fta.StartETCDOrDie(faultEtcd)

	// stop one apiserver node and check that k8s/operator/tidbcluster stay available
	faultApiserver := tests.SelectNode(conf.APIServers)
	fta.StopKubeAPIServerOrDie(faultApiserver)
	defer fta.StartKubeAPIServerOrDie(faultApiserver)
	oa.CheckOneApiserverDownOrDie(operatorCfg, allClusters, faultApiserver)
	fta.StartKubeAPIServerOrDie(faultApiserver)

	glog.Infof("\nFinished.")
}
```

Review comment: You should add other cases: stopping 2 etcds and stopping 3 etcds.
Author reply: Sure, I will add these cases in the next PR.
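`tests.SelectNode` is not part of this diff; from the call sites it simply picks one node out of the configured list as the fault target. A minimal sketch of what it might look like, assuming the config holds plain node names (the `[]string` type and the random choice are assumptions, not the repo's actual implementation):

```go
import "math/rand"

// SelectNode picks one node name from the configured list as the fault target.
// Hypothetical sketch: the real helper lives elsewhere in the tests package,
// and conf.ETCDs / conf.APIServers may use a richer type than []string.
func SelectNode(nodes []string) string {
	return nodes[rand.Intn(len(nodes))] // assumes a non-empty list
}
```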
```go
@@ -1,6 +1,7 @@
package tests

import (
	"database/sql"
	"fmt"
	"sort"
	"strings"
```
```go
@@ -445,3 +446,238 @@ func (oa *operatorActions) GetNodeMap(info *TidbClusterConfig, component string)

	return nodeMap, nil
}

func (oa *operatorActions) CheckOneEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) {
	KeepOrDie(3*time.Second, 10*time.Minute, func() error {
		err := oa.CheckK8sAvailable(nil, nil)
		if err != nil {
			return err
		}
		glog.Infof("k8s cluster is available.")
		err = oa.CheckOperatorAvailable(operatorConfig)
		if err != nil {
			return err
		}
		glog.Infof("tidb operator is available.")
		err = oa.CheckTidbClustersAvailable(clusters)
		if err != nil {
			return err
		}
		glog.Infof("all clusters are available")
		return nil
	})
}
```
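`KeepOrDie` is defined elsewhere in the tests package. From the call sites its contract appears to be: run the check every `interval` for the whole `duration`, and die on the first failure, so the availability invariant must hold for as long as the fault is active. A minimal sketch under that assumption (the body is inferred, not the repo's actual helper):

```go
import "time"

// KeepOrDie keeps running fn every interval for the full duration and panics
// on the first error. Hypothetical sketch inferred from the call sites above;
// the real helper lives elsewhere in the tests package.
func KeepOrDie(interval time.Duration, duration time.Duration, fn func() error) {
	deadline := time.Now().Add(duration)
	for time.Now().Before(deadline) {
		if err := fn(); err != nil {
			panic(err) // the invariant was violated while the fault was active
		}
		time.Sleep(interval)
	}
}
```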
|
||
func (oa *operatorActions) CheckOneApiserverDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) { | ||
affectedPods := map[string]*corev1.Pod{} | ||
apiserverPod, err := GetApiserverPod(oa.kubeCli, faultNode) | ||
if err != nil { | ||
panic(fmt.Errorf("can't find apiserver in node:%s", faultNode)) | ||
} | ||
if apiserverPod != nil { | ||
affectedPods[apiserverPod.GetName()] = apiserverPod | ||
} | ||
controllerPod, err := GetControllerManagerPod(oa.kubeCli, faultNode) | ||
if err != nil { | ||
glog.Infof("can't find controllerManager in node:%s", faultNode) | ||
} | ||
if controllerPod != nil { | ||
affectedPods[controllerPod.GetName()] = controllerPod | ||
} | ||
schedulerPod, err := GetSchedulerPod(oa.kubeCli, faultNode) | ||
if err != nil { | ||
glog.Infof("can't find scheduler in node:%s", faultNode) | ||
} | ||
if schedulerPod != nil { | ||
affectedPods[schedulerPod.GetName()] = schedulerPod | ||
} | ||
dnsPod, err := GetDnsPod(oa.kubeCli, faultNode) | ||
if err != nil { | ||
panic(fmt.Errorf("can't find controller-manager in node:%s", faultNode)) | ||
} | ||
if dnsPod != nil { | ||
affectedPods[dnsPod.GetName()] = dnsPod | ||
} | ||
KeepOrDie(3*time.Second, 10*time.Minute, func() error { | ||
err := oa.CheckK8sAvailable(nil, affectedPods) | ||
if err != nil { | ||
return err | ||
} | ||
glog.Infof("k8s cluster is available.") | ||
err = oa.CheckOperatorAvailable(operatorConfig) | ||
if err != nil { | ||
return err | ||
} | ||
glog.Infof("tidb operator is available.") | ||
err = oa.CheckTidbClustersAvailable(clusters) | ||
if err != nil { | ||
return err | ||
} | ||
glog.Infof("all clusters is available") | ||
return nil | ||
}) | ||
} | ||
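`GetApiserverPod` and its siblings (`GetControllerManagerPod`, `GetSchedulerPod`, `GetDnsPod`) are defined outside this hunk. A plausible sketch of the first one, assuming the control-plane components run as static pods in `kube-system` with the usual kubeadm `component` labels (the label value and the nil-on-missing behavior are assumptions):

```go
// GetApiserverPod looks up the kube-apiserver pod scheduled on the given
// node, returning nil if none is found there. The component label is an
// assumption based on how kubeadm labels static control-plane pods; it uses
// the same client-go style as the surrounding file.
func GetApiserverPod(kubeCli kubernetes.Interface, node string) (*corev1.Pod, error) {
	pods, err := kubeCli.CoreV1().Pods("kube-system").List(metav1.ListOptions{
		LabelSelector: "component=kube-apiserver",
	})
	if err != nil {
		return nil, err
	}
	for i := range pods.Items {
		if pods.Items[i].Spec.NodeName == node {
			return &pods.Items[i], nil
		}
	}
	return nil, nil
}
```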
```go
func (oa *operatorActions) CheckK8sAvailable(excludeNodes map[string]*corev1.Node, excludePods map[string]*corev1.Pod) error {
	return wait.Poll(3*time.Second, time.Minute, func() (bool, error) {
		nodes, err := oa.kubeCli.CoreV1().Nodes().List(metav1.ListOptions{})
		if err != nil {
			glog.Errorf("failed to list nodes, error: %v", err)
			return false, nil
		}
		for _, node := range nodes.Items {
			if _, exist := excludeNodes[node.GetName()]; exist {
				continue
			}
			for _, condition := range node.Status.Conditions {
				if condition.Type == corev1.NodeReady && condition.Status != corev1.ConditionTrue {
					glog.Errorf("node [%s] is not ready", node.GetName())
					return false, nil
				}
			}
		}
		systemPods, err := oa.kubeCli.CoreV1().Pods("kube-system").List(metav1.ListOptions{})
		if err != nil {
			glog.Errorf("failed to list kube-system pods, error: %v", err)
			return false, nil
		}
		for _, pod := range systemPods.Items {
			if _, exist := excludePods[pod.GetName()]; exist {
				continue
			}
			podState := GetPodStatus(&pod)
			if podState != string(corev1.PodRunning) {
				glog.Errorf("pod [%s/%s] is unavailable, state is %s", pod.GetNamespace(), pod.GetName(), podState)
				return false, nil
			}
		}
		return true, nil
	})
}
```

Review comment: use the default interval
Author reply: the default interval is too long for this case
```go
func (oa *operatorActions) CheckOperatorAvailable(operatorConfig *OperatorConfig) error {
	return wait.Poll(3*time.Second, 3*time.Minute, func() (bool, error) {
		controllerDeployment, err := oa.kubeCli.AppsV1().Deployments(operatorConfig.Namespace).Get(tidbControllerName, metav1.GetOptions{})
		if err != nil {
			glog.Errorf("failed to get deployment %s, error: %v", tidbControllerName, err)
			return false, nil
		}
		if controllerDeployment.Status.AvailableReplicas != *controllerDeployment.Spec.Replicas {
			return false, fmt.Errorf("the %s is not available", tidbControllerName)
		}
		schedulerDeployment, err := oa.kubeCli.AppsV1().Deployments(operatorConfig.Namespace).Get(tidbSchedulerName, metav1.GetOptions{})
		if err != nil {
			glog.Errorf("failed to get deployment %s, error: %v", tidbSchedulerName, err)
			return false, nil
		}
		if schedulerDeployment.Status.AvailableReplicas != *schedulerDeployment.Spec.Replicas {
			return false, fmt.Errorf("the %s is not available", tidbSchedulerName)
		}
		return true, nil
	})
}
```

Review comment: use the default interval
```go
func (oa *operatorActions) CheckTidbClustersAvailable(infos []*TidbClusterConfig) error {
	return wait.Poll(3*time.Second, 30*time.Second, func() (bool, error) {
		for _, info := range infos {
			succ, err := oa.addDataToCluster(info)
			if err != nil {
				return false, err
			}
			if !succ {
				return false, nil
			}
		}
		return true, nil
	})
}
```

Review comment: use the default interval
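All three availability checks hard-code a 3-second poll interval; the reviewers asked for the framework default, while the author argued the default is too long for fault-injection checks. If the values were to be factored out either way, a pair of package-level constants would keep the decision in one place (names and values here are hypothetical, not part of this PR):

```go
// Hypothetical shared polling knobs for the fault-injection checks; the
// concrete values would need to reconcile the review discussion above.
const (
	faultPollInterval = 3 * time.Second
	faultPollTimeout  = 10 * time.Minute
)
```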
```go
var testTableName = "testTable"

func (oa *operatorActions) addDataToCluster(info *TidbClusterConfig) (bool, error) {
	db, err := sql.Open("mysql", getDSN(info.Namespace, info.ClusterName, "test", info.Password))
	if err != nil {
		glog.Infof("cluster:[%s] can't open connection to mysql: %v", info.FullName(), err)
		return false, nil
	}
	defer db.Close()

	_, err = db.Exec(fmt.Sprintf("CREATE TABLE %s (name VARCHAR(64))", testTableName))
	if err != nil && !tableAlreadyExist(err) {
		glog.Infof("cluster:[%s] can't create table in mysql: %v", info.FullName(), err)
		return false, nil
	}

	_, err = db.Exec(fmt.Sprintf("INSERT INTO %s VALUES (?)", testTableName), "testValue")
	if err != nil {
		glog.Infof("cluster:[%s] can't insert data to mysql: %v", info.FullName(), err)
		return false, nil
	}

	return true, nil
}
```
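One subtlety: `sql.Open` does not actually dial the server, so a connection failure only surfaces at the first `Exec`. If a connectivity probe is wanted before issuing DDL, a `Ping` could be inserted after `sql.Open` (a sketch, not part of this PR):

```go
// Optional hardening, not in this PR: force a round trip to the server so
// connectivity errors surface before any DDL is attempted.
if err := db.Ping(); err != nil {
	glog.Infof("cluster:[%s] can't connect to mysql: %v", info.FullName(), err)
	return false, nil
}
```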
```go
// GetPodStatus renders a human-readable status for a pod, mirroring the
// STATUS column logic of `kubectl get pods` (adapted from kubectl's printers).
func GetPodStatus(pod *corev1.Pod) string {
	reason := string(pod.Status.Phase)
	if pod.Status.Reason != "" {
		reason = pod.Status.Reason
	}

	initializing := false
	for i := range pod.Status.InitContainerStatuses {
		container := pod.Status.InitContainerStatuses[i]
		switch {
		case container.State.Terminated != nil && container.State.Terminated.ExitCode == 0:
			continue
		case container.State.Terminated != nil:
			// initialization failed
			if len(container.State.Terminated.Reason) == 0 {
				if container.State.Terminated.Signal != 0 {
					reason = fmt.Sprintf("Init:Signal:%d", container.State.Terminated.Signal)
				} else {
					reason = fmt.Sprintf("Init:ExitCode:%d", container.State.Terminated.ExitCode)
				}
			} else {
				reason = "Init:" + container.State.Terminated.Reason
			}
			initializing = true
		case container.State.Waiting != nil && len(container.State.Waiting.Reason) > 0 && container.State.Waiting.Reason != "PodInitializing":
			reason = "Init:" + container.State.Waiting.Reason
			initializing = true
		default:
			reason = fmt.Sprintf("Init:%d/%d", i, len(pod.Spec.InitContainers))
			initializing = true
		}
		break
	}
	if !initializing {
		for i := len(pod.Status.ContainerStatuses) - 1; i >= 0; i-- {
			container := pod.Status.ContainerStatuses[i]

			if container.State.Waiting != nil && container.State.Waiting.Reason != "" {
				reason = container.State.Waiting.Reason
			} else if container.State.Terminated != nil && container.State.Terminated.Reason != "" {
				reason = container.State.Terminated.Reason
			} else if container.State.Terminated != nil && container.State.Terminated.Reason == "" {
				if container.State.Terminated.Signal != 0 {
					reason = fmt.Sprintf("Signal:%d", container.State.Terminated.Signal)
				} else {
					reason = fmt.Sprintf("ExitCode:%d", container.State.Terminated.ExitCode)
				}
			}
		}
	}

	if pod.DeletionTimestamp != nil && pod.Status.Reason == NodeUnreachablePodReason {
		reason = "Unknown"
	} else if pod.DeletionTimestamp != nil {
		reason = "Terminating"
	}

	return reason
}
```

Review comment: Why use such a complicated function?
Review comment: Where do these codes come from?
```go
func tableAlreadyExist(err error) bool {
	return strings.Contains(err.Error(), "already exists")
}
```
Review comment: This function has a return error
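Matching on the error string in `tableAlreadyExist` works but is brittle. Since the tests already use the MySQL driver, the same check could key off the server error code instead; this sketch assumes `github.com/go-sql-driver/mysql` is the driver in use:

```go
import "github.com/go-sql-driver/mysql"

// Error-code variant of tableAlreadyExist: MySQL signals CREATE TABLE on an
// existing table with server error 1050 (ER_TABLE_EXISTS_ERROR). Sketch only,
// not part of this PR.
func tableAlreadyExistByCode(err error) bool {
	if mysqlErr, ok := err.(*mysql.MySQLError); ok {
		return mysqlErr.Number == 1050
	}
	return false
}
```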