tests: refactor fault trigger #896

Merged · 32 commits · Oct 12, 2019

Commits (32)
689aa7c
refactor fault trigger
xiaojingchen Sep 10, 2019
8a7e6a1
fix config.yaml
xiaojingchen Sep 11, 2019
f836536
address comment
xiaojingchen Sep 11, 2019
8648f7e
Merge branch 'master' into agent-support-qm
xiaojingchen Sep 11, 2019
1721243
address comment
xiaojingchen Sep 12, 2019
7d1b2e1
Merge branch 'agent-support-qm' of https://github.com/xiaojingchen/ti…
xiaojingchen Sep 12, 2019
7b79f2c
fix unit tests
xiaojingchen Sep 12, 2019
b5bc4e0
fix bugs
xiaojingchen Sep 12, 2019
c6449e6
fix
xiaojingchen Sep 16, 2019
82cafca
Merge branch 'master' into agent-support-qm
xiaojingchen Sep 17, 2019
14e76a8
fix compatibility bug
xiaojingchen Sep 17, 2019
f39f241
Merge branch 'master' into agent-support-qm
cofyc Sep 18, 2019
ac4070a
add test name
xiaojingchen Sep 18, 2019
b09c8b1
Merge branch 'agent-support-qm' of https://github.com/xiaojingchen/ti…
xiaojingchen Sep 18, 2019
a3faa0e
refactor fault trigger
xiaojingchen Sep 10, 2019
54709e4
fix config.yaml
xiaojingchen Sep 11, 2019
68ddae7
address comment
xiaojingchen Sep 11, 2019
5753e98
address comment
xiaojingchen Sep 12, 2019
2cd1da6
fix unit tests
xiaojingchen Sep 12, 2019
3da2da4
fix bugs
xiaojingchen Sep 12, 2019
1f8a921
fix
xiaojingchen Sep 16, 2019
f697e92
fix compatibility bug
xiaojingchen Sep 17, 2019
466a0f4
add test name
xiaojingchen Sep 18, 2019
b48c82a
add apiserver fault trigger and check
xiaojingchen Sep 26, 2019
027eea8
fix
xiaojingchen Sep 26, 2019
f9dbde8
Merge branch 'master' into agent-support-qm
xiaojingchen Sep 27, 2019
6afcd3b
Merge branch 'master' into agent-support-qm
xiaojingchen Oct 10, 2019
ed5c31a
fix
xiaojingchen Oct 11, 2019
556238a
Merge branch 'master' into agent-support-qm
cofyc Oct 12, 2019
04af526
fix lint error
xiaojingchen Oct 12, 2019
16e083e
Merge branch 'master' into agent-support-qm
xiaojingchen Oct 12, 2019
1f4bd01
Merge branch 'agent-support-qm' of https://github.com/xiaojingchen/ti…
xiaojingchen Oct 12, 2019
15 changes: 10 additions & 5 deletions tests/actions.go
@@ -168,6 +168,7 @@ type OperatorActions interface {
CheckEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string)
CheckKubeletDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string)
CheckOneApiserverDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string)
CheckAllApiserverDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig)
CheckKubeProxyDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig)
CheckKubeSchedulerDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig)
CheckKubeControllerManagerDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig)
@@ -789,8 +790,10 @@ func (oa *operatorActions) CheckTidbClusterStatus(info *TidbClusterConfig) error
}

glog.V(4).Infof("check all pd and tikv instances have not pod scheduling annotation")
if b, err := oa.podsScheduleAnnHaveDeleted(tc); !b && err == nil {
return false, nil
if info.OperatorTag != "v1.0.0" {
if b, err := oa.podsScheduleAnnHaveDeleted(tc); !b && err == nil {
return false, nil
}
}

glog.V(4).Infof("check store labels")
@@ -1875,12 +1878,14 @@ func (oa *operatorActions) checkoutTag(tagName string) error {
cmd := fmt.Sprintf("cd %s && git stash -u && git checkout %s && "+
"mkdir -p %s && cp -rf charts/tidb-operator %s && "+
"cp -rf charts/tidb-cluster %s && cp -rf charts/tidb-backup %s &&"+
"cp -rf manifests %s &&"+
"cp -rf charts/tidb-drainer %s",
"cp -rf manifests %s",
oa.cfg.OperatorRepoDir, tagName,
filepath.Join(oa.cfg.ChartDir, tagName), oa.operatorChartPath(tagName),
oa.tidbClusterChartPath(tagName), oa.backupChartPath(tagName),
oa.manifestPath(tagName), oa.drainerChartPath(tagName))
oa.manifestPath(tagName))
if tagName != "v1.0.0" {
cmd = cmd + fmt.Sprintf(" && cp -rf charts/tidb-drainer %s", oa.drainerChartPath(tagName))
}
glog.Info(cmd)
res, err := exec.Command("/bin/sh", "-c", cmd).CombinedOutput()
if err != nil {
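For illustration, the checkoutTag change above appends the tidb-drainer chart copy only for tags other than v1.0.0, presumably because a v1.0.0 checkout has no charts/tidb-drainer directory to copy. A small standalone sketch of that conditional command assembly (the paths are placeholders, not the repo's layout):

package main

import "fmt"

// commandFor mirrors the conditional assembly in checkoutTag; the base
// command and destination paths are placeholders.
func commandFor(tagName string) string {
	cmd := fmt.Sprintf("cp -rf manifests /charts/%s/manifests", tagName)
	if tagName != "v1.0.0" {
		// The tidb-drainer chart only exists for newer tags.
		cmd += fmt.Sprintf(" && cp -rf charts/tidb-drainer /charts/%s/tidb-drainer", tagName)
	}
	return cmd
}

func main() {
	fmt.Println(commandFor("v1.0.0")) // no tidb-drainer copy for the old tag
	fmt.Println(commandFor("v1.1.0")) // tidb-drainer copy appended
}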
4 changes: 3 additions & 1 deletion tests/cmd/fault-trigger/main.go
@@ -30,11 +30,13 @@ import (
var (
port int
pprofPort int
vmManager string
)

func init() {
flag.IntVar(&port, "port", 23332, "The port that the fault trigger's http service runs on (default 23332)")
flag.IntVar(&pprofPort, "pprof-port", 6060, "The port that the pprof's http service runs on (default 6060)")
flag.StringVar(&vmManager, "vm-manager", "virsh", "the vm manager, virsh/qm (default virsh)")

flag.Parse()
}
@@ -43,7 +45,7 @@ func main() {
logs.InitLogs()
defer logs.FlushLogs()

mgr := manager.NewManager()
mgr := manager.NewManager(vmManager)
Contributor @cofyc commented on Sep 11, 2019:

Better not to allow an invalid --vm-manager value; we can initialize the VM manager and handle errors here, e.g.

var vmMgr VMManager
if vmManager == "qm" {
   vmMgr = NewQMManager()
} else if vmManager == "virsh" {
   vmMgr = NewVirshManager()
} else {
   // fatal error
}

mgr := manager.NewManager(vmMgr)

If a user configures an invalid value but the program still works, it will confuse people: they can't tell which VM manager is in use from the command-line flags unless they know the implementation details.
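A slightly fuller sketch of the validation suggested above; VMManager, NewQMManager, and NewVirshManager are taken from the reviewer's snippet and are illustrative names rather than confirmed API, and glog.Fatalf stands in for whatever fatal-error path this file actually uses:

// Sketch only: reject an unsupported --vm-manager value up front instead of
// silently falling back to a default manager.
var vmMgr manager.VMManager // assumed interface name
switch vmManager {
case "virsh":
	vmMgr = manager.NewVirshManager()
case "qm":
	vmMgr = manager.NewQMManager()
default:
	glog.Fatalf("invalid --vm-manager %q: expected \"virsh\" or \"qm\"", vmManager)
}
mgr := manager.NewManager(vmMgr)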

server := api.NewServer(mgr, port)

go wait.Forever(func() {
46 changes: 37 additions & 9 deletions tests/cmd/stability/main.go
@@ -28,7 +28,7 @@ import (
"github.com/pingcap/tidb-operator/tests/pkg/client"
"github.com/pingcap/tidb-operator/tests/slack"
"github.com/robfig/cron"
v1 "k8s.io/api/core/v1"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apiserver/pkg/util/logs"
)
@@ -270,28 +270,51 @@ func run() {
// stop all kube-scheduler pods
for _, physicalNode := range cfg.APIServers {
for _, vNode := range physicalNode.Nodes {
fta.StopKubeSchedulerOrDie(vNode)
fta.StopKubeSchedulerOrDie(vNode.IP)
}
}
oa.CheckKubeSchedulerDownOrDie(ocfg, clusters)
for _, physicalNode := range cfg.APIServers {
for _, vNode := range physicalNode.Nodes {
fta.StartKubeSchedulerOrDie(vNode)
fta.StartKubeSchedulerOrDie(vNode.IP)
}
}

// stop all kube-controller-manager pods
for _, physicalNode := range cfg.APIServers {
for _, vNode := range physicalNode.Nodes {
fta.StopKubeControllerManagerOrDie(vNode)
fta.StopKubeControllerManagerOrDie(vNode.IP)
}
}
oa.CheckKubeControllerManagerDownOrDie(ocfg, clusters)
for _, physicalNode := range cfg.APIServers {
for _, vNode := range physicalNode.Nodes {
fta.StartKubeControllerManagerOrDie(vNode)
fta.StartKubeControllerManagerOrDie(vNode.IP)
}
}

// stop one kube-apiserver pod
faultApiServer := tests.SelectNode(cfg.APIServers)
fta.StopKubeAPIServerOrDie(faultApiServer)
defer fta.StartKubeAPIServerOrDie(faultApiServer)
time.Sleep(3 * time.Minute)
oa.CheckOneApiserverDownOrDie(ocfg, clusters, faultApiServer)
fta.StartKubeAPIServerOrDie(faultApiServer)

time.Sleep(time.Minute)
// stop all kube-apiserver pods
for _, physicalNode := range cfg.APIServers {
for _, vNode := range physicalNode.Nodes {
fta.StopKubeAPIServerOrDie(vNode.IP)
}
}
oa.CheckAllApiserverDownOrDie(ocfg, clusters)
Contributor commented:

You should ensure that no pods are recreated after stopping all the apiservers, just like: https://github.com/pingcap/tidb-operator/pull/955/files#diff-67dacabf080ca4dc8e95d2acf12ee36bR489

Contributor Author @xiaojingchen commented on Sep 27, 2019:

PR #955 has not been merged yet. In my opinion, we can add this checkpoint in a new PR.

for _, physicalNode := range cfg.APIServers {
for _, vNode := range physicalNode.Nodes {
fta.StartKubeAPIServerOrDie(vNode.IP)
}
}
time.Sleep(time.Minute)
}

// before operator upgrade
@@ -305,11 +328,13 @@
IsAdditional: false,
IncrementalType: tests.DbTypeTiDB,
},
{
}
if ocfg.Tag != "v1.0.0" {
backupTargets = append(backupTargets, tests.BackupTarget{
TargetCluster: fileRestoreCluster1,
IsAdditional: true,
IncrementalType: tests.DbTypeFile,
},
})
}
caseFn(preUpgrade, onePDCluster1, backupTargets, upgradeVersions[0])

@@ -334,11 +359,14 @@
IsAdditional: false,
IncrementalType: tests.DbTypeTiDB,
},
{
}

if ocfg.Tag != "v1.0.0" {
postUpgradeBackupTargets = append(postUpgradeBackupTargets, tests.BackupTarget{
TargetCluster: fileRestoreCluster2,
IsAdditional: true,
IncrementalType: tests.DbTypeFile,
},
})
}
// caseFn(postUpgrade, restoreCluster2, tidbUpgradeVersion)
caseFn(postUpgrade, onePDCluster2, postUpgradeBackupTargets, v)
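tests.SelectNode is used above to pick a single fault apiserver, and its result is passed where a node IP is expected. Its implementation is not part of this diff; the following is only an assumed shape under the new Node type from tests/config.go (shown further down), not the repo's actual code:

// Assumed sketch of tests.SelectNode: pick one virtual node at random from
// the configured physical nodes and return its IP. Inferred from usage only;
// the real implementation may differ. (Uses math/rand and time.)
func SelectNode(nodes []Nodes) string {
	rand.Seed(time.Now().UnixNano())
	physical := nodes[rand.Intn(len(nodes))]
	vNode := physical.Nodes[rand.Intn(len(physical.Nodes))]
	return vNode.IP
}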
10 changes: 8 additions & 2 deletions tests/config.go
@@ -60,8 +60,13 @@ type Config struct {

// Nodes defines a series of nodes that belong to the same physical node.
type Nodes struct {
PhysicalNode string `yaml:"physical_node" json:"physical_node"`
Nodes []string `yaml:"nodes" json:"nodes"`
PhysicalNode string `yaml:"physical_node" json:"physical_node"`
Nodes []Node `yaml:"nodes" json:"nodes"`
}

type Node struct {
IP string `yaml:"ip" json:"ip"`
Name string `yaml:"name" json:"name"`
}

// NewConfig creates a new config.
@@ -92,6 +97,7 @@ func NewConfig() (*Config, error) {
flag.StringVar(&cfg.OperatorRepoUrl, "operator-repo-url", "https://github.com/pingcap/tidb-operator.git", "tidb-operator repo url used")
flag.StringVar(&cfg.ChartDir, "chart-dir", "", "chart dir")
flag.StringVar(&slack.WebhookURL, "slack-webhook-url", "", "slack webhook url")
flag.StringVar(&slack.TestName, "test-name", "operator-test", "the stability test name")
flag.Parse()

operatorRepo, err := ioutil.TempDir("", "tidb-operator")
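The Nodes change above replaces the plain list of node strings with objects that carry both an IP and a VM name (the name is what virsh/qm act on). A small self-contained sketch of how a config fragment under the new schema would unmarshal; the IPs and names are made up, and gopkg.in/yaml.v2 is assumed as the YAML library:

package main

import (
	"fmt"

	yaml "gopkg.in/yaml.v2"
)

// Mirrors the new tests.Nodes / tests.Node types from the diff above.
type Node struct {
	IP   string `yaml:"ip" json:"ip"`
	Name string `yaml:"name" json:"name"`
}

type Nodes struct {
	PhysicalNode string `yaml:"physical_node" json:"physical_node"`
	Nodes        []Node `yaml:"nodes" json:"nodes"`
}

func main() {
	// Hypothetical config fragment: each virtual node now carries its IP and
	// the VM name that virsh/qm act on, instead of a bare IP string.
	data := []byte(`
physical_node: 172.16.4.1
nodes:
- ip: 172.16.4.11
  name: vm11
- ip: 172.16.4.12
  name: vm12
`)
	var ns Nodes
	if err := yaml.Unmarshal(data, &ns); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", ns)
}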
13 changes: 12 additions & 1 deletion tests/failover.go
@@ -636,7 +636,7 @@ func (oa *operatorActions) CheckOneApiserverDownOrDie(operatorConfig *OperatorCo
slack.NotifyAndPanic(fmt.Errorf("can't find kube-proxy in k8s cluster"))
}
if proxyPod != nil {
affectedPods[dnsPod.GetName()] = proxyPod
affectedPods[proxyPod.GetName()] = proxyPod
}
KeepOrDie(3*time.Second, 10*time.Minute, func() error {
err := oa.CheckK8sAvailable(map[string]string{faultNode: faultNode}, affectedPods)
@@ -658,6 +658,17 @@ func (oa *operatorActions) CheckOneApiserverDownOrDie(operatorConfig *OperatorCo
})
}

func (oa *operatorActions) CheckAllApiserverDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig) {
KeepOrDie(3*time.Second, 10*time.Minute, func() error {
err := oa.CheckTidbClustersAvailable(clusters)
if err != nil {
return err
}
glog.V(4).Infof("all clusters is available")
return nil
})
}

func (oa *operatorActions) CheckOperatorDownOrDie(clusters []*TidbClusterConfig) {
glog.Infof("checking k8s/tidbCluster status when operator down")

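CheckAllApiserverDownOrDie above leans on KeepOrDie to require CheckTidbClustersAvailable to keep passing for a fixed window. KeepOrDie itself is not shown in this diff; a minimal sketch of what such a helper could look like, assumed from its call sites (interval, total duration, check function) rather than taken from the repo:

// Assumed sketch of KeepOrDie: run check every interval until duration has
// elapsed; if the check ever fails, report to Slack and panic. Not the
// repo's actual implementation.
func KeepOrDie(interval, duration time.Duration, check func() error) {
	deadline := time.Now().Add(duration)
	for time.Now().Before(deadline) {
		if err := check(); err != nil {
			slack.NotifyAndPanic(err)
		}
		time.Sleep(interval)
	}
}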
52 changes: 39 additions & 13 deletions tests/fault.go
@@ -85,7 +85,7 @@ func (fa *faultTriggerActions) CheckAndRecoverEnv() error {
glog.Infof("ensure all nodes are running")
for _, physicalNode := range fa.cfg.Nodes {
for _, vNode := range physicalNode.Nodes {
err := fa.StartNode(physicalNode.PhysicalNode, vNode)
err := fa.StartNode(physicalNode.PhysicalNode, vNode.IP)
if err != nil {
return err
}
@@ -108,15 +108,15 @@ func (fa *faultTriggerActions) CheckAndRecoverEnv() error {
glog.Infof("ensure all static pods are running")
for _, physicalNode := range fa.cfg.APIServers {
for _, vNode := range physicalNode.Nodes {
err := fa.StartKubeAPIServer(vNode)
err := fa.StartKubeAPIServer(vNode.IP)
if err != nil {
return err
}
err = fa.StartKubeControllerManager(vNode)
err = fa.StartKubeControllerManager(vNode.IP)
if err != nil {
return err
}
err = fa.StartKubeScheduler(vNode)
err = fa.StartKubeScheduler(vNode.IP)
if err != nil {
return err
}
@@ -155,8 +155,13 @@ func (fa *faultTriggerActions) StopNode() (string, string, time.Time, error) {
Addr: fa.genFaultTriggerAddr(physicalNode),
})

name := getNameByIP(fa.cfg, node)
if name == "" {
return "", "", now, fmt.Errorf("failed to find %s's name in cfg:[%v]", node, fa.cfg)
}

if err := faultCli.StopVM(&manager.VM{
IP: node,
Name: name,
}); err != nil {
glog.Errorf("failed to stop node %s on physical node: %s: %v", node, physicalNode, err)
return "", "", now, err
@@ -187,14 +192,16 @@ func (fa *faultTriggerActions) StartNode(physicalNode string, node string) error
return err
}

name := getNameByIP(fa.cfg, node)

for _, vm := range vms {
if vm.IP == node && vm.Status == "running" {
if vm.Name == name && vm.Status == "running" {
return nil
}
}

if err := faultCli.StartVM(&manager.VM{
IP: node,
Name: name,
}); err != nil {
glog.Errorf("failed to start node %s on physical node %s: %v", node, physicalNode, err)
return err
@@ -322,7 +329,7 @@ func (fa *faultTriggerActions) StartKubeProxyOrDie() {
func (fa *faultTriggerActions) StopETCD(nodes ...string) error {
if len(nodes) == 0 {
for _, ns := range fa.cfg.ETCDs {
nodes = append(nodes, ns.Nodes...)
nodes = append(nodes, getIps(ns.Nodes)...)
}
}

@@ -346,7 +353,7 @@ func (fa *faultTriggerActions) StopETCDOrDie(nodes ...string) {
func (fa *faultTriggerActions) StopKubelet(nodes ...string) error {
if len(nodes) == 0 {
for _, ns := range fa.cfg.Nodes {
nodes = append(nodes, ns.Nodes...)
nodes = append(nodes, getIps(ns.Nodes)...)
}
}

@@ -370,7 +377,7 @@ func (fa *faultTriggerActions) StopKubeletOrDie(nodes ...string) {
func (fa *faultTriggerActions) StartKubelet(nodes ...string) error {
if len(nodes) == 0 {
for _, ns := range fa.cfg.Nodes {
nodes = append(nodes, ns.Nodes...)
nodes = append(nodes, getIps(ns.Nodes)...)
}
}

@@ -394,7 +401,7 @@ func (fa *faultTriggerActions) StartKubeletOrDie(nodes ...string) {
func (fa *faultTriggerActions) StartETCD(nodes ...string) error {
if len(nodes) == 0 {
for _, ns := range fa.cfg.ETCDs {
nodes = append(nodes, ns.Nodes...)
nodes = append(nodes, getIps(ns.Nodes)...)
}
}

@@ -599,7 +606,7 @@ func getPhysicalNode(faultNode string, cfg *Config) string {
var physicalNode string
for _, nodes := range cfg.Nodes {
for _, node := range nodes.Nodes {
if node == faultNode {
if node.IP == faultNode {
physicalNode = nodes.PhysicalNode
}
}
@@ -611,7 +618,26 @@ func getAllK8sNodes(cfg *Config) []string {
func getAllK8sNodes(cfg *Config) []string {
var allNodes []string
for _, nodes := range cfg.Nodes {
allNodes = append(allNodes, nodes.Nodes...)
allNodes = append(allNodes, getIps(nodes.Nodes)...)
}
return allNodes
}

func getNameByIP(cfg *Config, ip string) string {
for _, nodes := range cfg.Nodes {
for _, node := range nodes.Nodes {
if node.IP == ip {
return node.Name
}
}
}
return ""
}

func getIps(nodes []Node) []string {
var ips []string
for _, node := range nodes {
ips = append(ips, node.IP)
}
return ips
}
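A quick illustration of the two helpers added above, with made-up values: getNameByIP resolves the VM name that the virsh/qm managers need from a node IP (empty string when the IP is unknown), and getIps flattens a node list back to plain IPs for callers that still work with addresses:

// Illustrative only; the Config value is made up.
cfg := &Config{
	Nodes: []Nodes{{
		PhysicalNode: "172.16.4.1",
		Nodes: []Node{
			{IP: "172.16.4.11", Name: "vm11"},
			{IP: "172.16.4.12", Name: "vm12"},
		},
	}},
}
fmt.Println(getNameByIP(cfg, "172.16.4.11")) // vm11
fmt.Println(getNameByIP(cfg, "10.0.0.1"))    // "" (unknown IP)
fmt.Println(getIps(cfg.Nodes[0].Nodes))      // [172.16.4.11 172.16.4.12]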