Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tests: refactor fault trigger #896

Merged
merged 32 commits into from
Oct 12, 2019
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
689aa7c
refactor fault trigger
xiaojingchen Sep 10, 2019
8a7e6a1
fix config.yaml
xiaojingchen Sep 11, 2019
f836536
address comment
xiaojingchen Sep 11, 2019
8648f7e
Merge branch 'master' into agent-support-qm
xiaojingchen Sep 11, 2019
1721243
address comment
xiaojingchen Sep 12, 2019
7d1b2e1
Merge branch 'agent-support-qm' of https://github.com/xiaojingchen/ti…
xiaojingchen Sep 12, 2019
7b79f2c
fix unit tests
xiaojingchen Sep 12, 2019
b5bc4e0
fix bugs
xiaojingchen Sep 12, 2019
c6449e6
fix
xiaojingchen Sep 16, 2019
82cafca
Merge branch 'master' into agent-support-qm
xiaojingchen Sep 17, 2019
14e76a8
fix compatibility bug
xiaojingchen Sep 17, 2019
f39f241
Merge branch 'master' into agent-support-qm
cofyc Sep 18, 2019
ac4070a
add test name
xiaojingchen Sep 18, 2019
b09c8b1
Merge branch 'agent-support-qm' of https://github.com/xiaojingchen/ti…
xiaojingchen Sep 18, 2019
a3faa0e
refactor fault trigger
xiaojingchen Sep 10, 2019
54709e4
fix config.yaml
xiaojingchen Sep 11, 2019
68ddae7
address comment
xiaojingchen Sep 11, 2019
5753e98
address comment
xiaojingchen Sep 12, 2019
2cd1da6
fix unit tests
xiaojingchen Sep 12, 2019
3da2da4
fix bugs
xiaojingchen Sep 12, 2019
1f8a921
fix
xiaojingchen Sep 16, 2019
f697e92
fix compatibility bug
xiaojingchen Sep 17, 2019
466a0f4
add test name
xiaojingchen Sep 18, 2019
b48c82a
add apiserver fault trigger and check
xiaojingchen Sep 26, 2019
027eea8
fix
xiaojingchen Sep 26, 2019
f9dbde8
Merge branch 'master' into agent-support-qm
xiaojingchen Sep 27, 2019
6afcd3b
Merge branch 'master' into agent-support-qm
xiaojingchen Oct 10, 2019
ed5c31a
fix
xiaojingchen Oct 11, 2019
556238a
Merge branch 'master' into agent-support-qm
cofyc Oct 12, 2019
04af526
fix lint error
xiaojingchen Oct 12, 2019
16e083e
Merge branch 'master' into agent-support-qm
xiaojingchen Oct 12, 2019
1f4bd01
Merge branch 'agent-support-qm' of https://github.com/xiaojingchen/ti…
xiaojingchen Oct 12, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion tests/cmd/fault-trigger/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,13 @@ import (
var (
port int
pprofPort int
vmManager string
)

func init() {
flag.IntVar(&port, "port", 23332, "The port that the fault trigger's http service runs on (default 23332)")
flag.IntVar(&pprofPort, "pprof-port", 6060, "The port that the pprof's http service runs on (default 6060)")
flag.StringVar(&vmManager, "vm-manager", "virsh", "the vm manager, virsh/qm (default virsh)")

flag.Parse()
}
Expand All @@ -43,7 +45,7 @@ func main() {
logs.InitLogs()
defer logs.FlushLogs()

mgr := manager.NewManager()
mgr := manager.NewManager(vmManager)
Copy link
Contributor

@cofyc cofyc Sep 11, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

better not to allow invalid --vm-manager value, we can initialize vm manager and handle errors here, e.g.

var vmMgr VMManager
if vmManager == "qm" {
   vmMgr = NewQMManager()
} else if vmManager == "virsh" {
   vmMgr = NewVirshManager()
} else {
   // fatal error
}

mgr := manager.NewManager(vmMgr)

if some users configured an invalid value, but our program still works, this will confuse people because they don't know what virtual manager we use from the command-line flags unless they know implementation details

server := api.NewServer(mgr, port)

go wait.Forever(func() {
Expand Down
8 changes: 4 additions & 4 deletions tests/cmd/stability/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -266,26 +266,26 @@ func run() {
// stop all kube-scheduler pods
for _, physicalNode := range cfg.APIServers {
for _, vNode := range physicalNode.Nodes {
fta.StopKubeSchedulerOrDie(vNode)
fta.StopKubeSchedulerOrDie(vNode.IP)
}
}
oa.CheckKubeSchedulerDownOrDie(ocfg, clusters)
for _, physicalNode := range cfg.APIServers {
for _, vNode := range physicalNode.Nodes {
fta.StartKubeSchedulerOrDie(vNode)
fta.StartKubeSchedulerOrDie(vNode.IP)
}
}

// stop all kube-controller-manager pods
for _, physicalNode := range cfg.APIServers {
for _, vNode := range physicalNode.Nodes {
fta.StopKubeControllerManagerOrDie(vNode)
fta.StopKubeControllerManagerOrDie(vNode.IP)
}
}
oa.CheckKubeControllerManagerDownOrDie(ocfg, clusters)
for _, physicalNode := range cfg.APIServers {
for _, vNode := range physicalNode.Nodes {
fta.StartKubeControllerManagerOrDie(vNode)
fta.StartKubeControllerManagerOrDie(vNode.IP)
}
}
}
Expand Down
9 changes: 7 additions & 2 deletions tests/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,13 @@ type Config struct {

// Nodes defines a series of nodes that belong to the same physical node.
type Nodes struct {
PhysicalNode string `yaml:"physical_node" json:"physical_node"`
Nodes []string `yaml:"nodes" json:"nodes"`
PhysicalNode string `yaml:"physical_node" json:"physical_node"`
Nodes []Node `yaml:"nodes" json:"nodes"`
}

type Node struct {
IP string `yaml:"ip" json:"ip"`
Name string `yaml:"name" json:"name"`
}

// NewConfig creates a new config.
Expand Down
52 changes: 39 additions & 13 deletions tests/fault.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ func (fa *faultTriggerActions) CheckAndRecoverEnv() error {
glog.Infof("ensure all nodes are running")
for _, physicalNode := range fa.cfg.Nodes {
for _, vNode := range physicalNode.Nodes {
err := fa.StartNode(physicalNode.PhysicalNode, vNode)
err := fa.StartNode(physicalNode.PhysicalNode, vNode.IP)
if err != nil {
return err
}
Expand All @@ -108,15 +108,15 @@ func (fa *faultTriggerActions) CheckAndRecoverEnv() error {
glog.Infof("ensure all static pods are running")
for _, physicalNode := range fa.cfg.APIServers {
for _, vNode := range physicalNode.Nodes {
err := fa.StartKubeAPIServer(vNode)
err := fa.StartKubeAPIServer(vNode.IP)
if err != nil {
return err
}
err = fa.StartKubeControllerManager(vNode)
err = fa.StartKubeControllerManager(vNode.IP)
if err != nil {
return err
}
err = fa.StartKubeScheduler(vNode)
err = fa.StartKubeScheduler(vNode.IP)
if err != nil {
return err
}
Expand Down Expand Up @@ -155,8 +155,13 @@ func (fa *faultTriggerActions) StopNode() (string, string, time.Time, error) {
Addr: fa.genFaultTriggerAddr(physicalNode),
})

name := getNameByIP(fa.cfg, node)
if name == "" {
return "", "", now, fmt.Errorf("failed to find %s's name in cfg:[%v]", node)
}

if err := faultCli.StopVM(&manager.VM{
IP: node,
Name: name,
}); err != nil {
glog.Errorf("failed to stop node %s on physical node: %s: %v", node, physicalNode, err)
return "", "", now, err
Expand Down Expand Up @@ -187,14 +192,16 @@ func (fa *faultTriggerActions) StartNode(physicalNode string, node string) error
return err
}

name := getNameByIP(fa.cfg, node)

for _, vm := range vms {
if vm.IP == node && vm.Status == "running" {
if vm.Name == name && vm.Status == "running" {
return nil
}
}

if err := faultCli.StartVM(&manager.VM{
IP: node,
Name: name,
}); err != nil {
glog.Errorf("failed to start node %s on physical node %s: %v", node, physicalNode, err)
return err
Expand Down Expand Up @@ -322,7 +329,7 @@ func (fa *faultTriggerActions) StartKubeProxyOrDie() {
func (fa *faultTriggerActions) StopETCD(nodes ...string) error {
if len(nodes) == 0 {
for _, ns := range fa.cfg.ETCDs {
nodes = append(nodes, ns.Nodes...)
nodes = append(nodes, getIps(ns.Nodes)...)
}
}

Expand All @@ -346,7 +353,7 @@ func (fa *faultTriggerActions) StopETCDOrDie(nodes ...string) {
func (fa *faultTriggerActions) StopKubelet(nodes ...string) error {
if len(nodes) == 0 {
for _, ns := range fa.cfg.Nodes {
nodes = append(nodes, ns.Nodes...)
nodes = append(nodes, getIps(ns.Nodes)...)
}
}

Expand All @@ -370,7 +377,7 @@ func (fa *faultTriggerActions) StopKubeletOrDie(nodes ...string) {
func (fa *faultTriggerActions) StartKubelet(nodes ...string) error {
if len(nodes) == 0 {
for _, ns := range fa.cfg.Nodes {
nodes = append(nodes, ns.Nodes...)
nodes = append(nodes, getIps(ns.Nodes)...)
}
}

Expand All @@ -394,7 +401,7 @@ func (fa *faultTriggerActions) StartKubeletOrDie(nodes ...string) {
func (fa *faultTriggerActions) StartETCD(nodes ...string) error {
if len(nodes) == 0 {
for _, ns := range fa.cfg.ETCDs {
nodes = append(nodes, ns.Nodes...)
nodes = append(nodes, getIps(ns.Nodes)...)
}
}

Expand Down Expand Up @@ -599,7 +606,7 @@ func getPhysicalNode(faultNode string, cfg *Config) string {
var physicalNode string
for _, nodes := range cfg.Nodes {
for _, node := range nodes.Nodes {
if node == faultNode {
if node.IP == faultNode {
physicalNode = nodes.PhysicalNode
}
}
Expand All @@ -611,7 +618,26 @@ func getPhysicalNode(faultNode string, cfg *Config) string {
func getAllK8sNodes(cfg *Config) []string {
var allNodes []string
for _, nodes := range cfg.Nodes {
allNodes = append(allNodes, nodes.Nodes...)
allNodes = append(allNodes, getIps(nodes.Nodes)...)
}
return allNodes
}

func getNameByIP(cfg *Config, ip string) string {
for _, nodes := range cfg.Nodes {
for _, node := range nodes.Nodes {
if node.IP == ip {
return node.Name
}
}
}
return ""
}

func getIps(nodes []Node) []string {
var ips []string
for _, node := range nodes {
ips = append(ips, node.IP)
}
return ips
}
58 changes: 42 additions & 16 deletions tests/manifests/stability/stability-configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,51 @@ data:
block_writer:
concurrency: 12
nodes:
- physical_node: 172.16.4.38
- physical_node: 172.16.5.11
nodes:
- 172.16.4.177
- 172.16.4.178
- 172.16.4.179
- physical_node: 172.16.4.37
- ip: 172.16.4.247
name: 105
- physical_node: 172.16.5.26
nodes:
- 172.16.4.180
- 172.16.4.181
- 172.16.4.182
- ip: 172.16.4.133
name: 200
- physical_node: 172.16.5.27
nodes:
- ip: 172.16.4.121
name: 203
- physical_node: 172.16.5.28
nodes:
- ip: 172.16.4.139
name: 204
- physical_node: 172.16.5.29
nodes:
- ip: 172.16.5.147
name: 137
- ip: 172.16.5.148
name: 138
etcds:
- physical_node: 172.16.4.37
- physical_node: 172.16.5.11
nodes:
- ip: 172.16.4.247
name: 105
- physical_node: 172.16.5.26
nodes:
- 172.16.4.180
- 172.16.4.181
- 172.16.4.182
- ip: 172.16.4.133
name: 200
- physical_node: 172.16.5.27
nodes:
- ip: 172.16.4.121
name: 203
apiservers:
- physical_node: 172.16.4.37
- physical_node: 172.16.5.11
nodes:
- ip: 172.16.4.247
name: 105
- physical_node: 172.16.5.26
nodes:
- ip: 172.16.4.133
name: 200
- physical_node: 172.16.5.27
nodes:
- 172.16.4.180
- 172.16.4.181
- 172.16.4.182
- ip: 172.16.4.121
name: 203
2 changes: 1 addition & 1 deletion tests/pkg/fault-trigger/api/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ func (s *Server) getVM(name string) (*manager.VM, error) {
}

for _, vm := range vms {
if name == vm.Name || name == vm.IP {
if name == vm.Name {
return vm, nil
}
}
Expand Down
6 changes: 0 additions & 6 deletions tests/pkg/fault-trigger/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,9 +158,6 @@ func (c *client) StartVM(vm *manager.VM) error {
}

vmName := vm.Name
if len(vmName) == 0 {
vmName = vm.IP
}

url := util.GenURL(fmt.Sprintf("%s%s/vm/%s/start", c.cfg.Addr, api.APIPrefix, vmName))
if _, err := c.post(url, nil); err != nil {
Expand All @@ -177,9 +174,6 @@ func (c *client) StopVM(vm *manager.VM) error {
}

vmName := vm.Name
if len(vmName) == 0 {
vmName = vm.IP
}

url := util.GenURL(fmt.Sprintf("%s%s/vm/%s/stop", c.cfg.Addr, api.APIPrefix, vmName))
if _, err := c.post(url, nil); err != nil {
Expand Down
36 changes: 13 additions & 23 deletions tests/pkg/fault-trigger/manager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,36 +15,26 @@ package manager

import (
"fmt"
"sync"

"github.com/pingcap/tidb-operator/tests/slack"
)

// Manager to manager fault trigger
type Manager struct {
sync.RWMutex
vmCache map[string]string
VMManager
}

// NewManager returns a manager instance
func NewManager() *Manager {
return &Manager{
vmCache: make(map[string]string),
func NewManager(vmManagerName string) *Manager {
var vmManager VMManager
if vmManagerName == "qm" {
vmManager = &QMVMManager{}
} else if vmManagerName == "virsh" {
vmManager = &VirshVMManager{}
} else {
slack.NotifyAndPanic(fmt.Errorf("stability test have not supported the vm manager:[%s],please choose [qm] or [virsh].", vmManagerName))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in fault-trigger, slack.Webhook endpoint is not provided. IMO a simple fatal error is enough.

}
}

func (m *Manager) setVMCache(key, val string) {
m.Lock()
m.vmCache[key] = val
m.Unlock()
}

func (m *Manager) getVMCache(key string) (string, error) {
m.RLock()
defer m.RUnlock()

val, ok := m.vmCache[key]
if !ok {
return "", fmt.Errorf("vm %s not in cache", key)
return &Manager{
vmManager,
}

return val, nil
}
5 changes: 2 additions & 3 deletions tests/pkg/fault-trigger/manager/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,13 @@ type VM struct {
Host string `json:"host"`
Port int64 `json:"port"`
Name string `json:"name"`
IP string `json:"ip"`
Role []string `json:"role"`
Status string `json:"status"`
}

func (v *VM) Verify() error {
if len(v.Name) == 0 && len(v.IP) == 0 {
return errors.New("name or ip must be provided")
if len(v.Name) == 0 {
return errors.New("name must be provided")
}

return nil
Expand Down
Loading