Skip to content

Commit

Permalink
monitor checker (pingcap#320)
Browse files Browse the repository at this point in the history
* monitor checker

* fix bugs
  • Loading branch information
xiaojingchen authored and weekface committed Mar 21, 2019
1 parent 3e9c07b commit 55a0b1a
Showing 1 changed file with 128 additions and 3 deletions.
131 changes: 128 additions & 3 deletions tests/actions.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@ package tests
import (
"bytes"
"database/sql"
"encoding/json"
"fmt"
"io/ioutil"
"net/http"
"net/url"
"os/exec"
"strconv"
"strings"
Expand Down Expand Up @@ -50,7 +54,12 @@ func NewOperatorActions(cli versioned.Interface, kubeCli kubernetes.Interface, l

const (
DefaultPollTimeout time.Duration = 10 * time.Minute
DefaultPollInterval time.Duration = 1 * time.Minute
DefaultPollInterval time.Duration = 10 * time.Second
)

const (
grafanaUsername = "admin"
grafanaPassword = "admin"
)

type OperatorActions interface {
Expand All @@ -73,8 +82,6 @@ type OperatorActions interface {
CheckIncrementalBackup(info *TidbClusterInfo) error
Restore(from *TidbClusterInfo, to *TidbClusterInfo) error
CheckRestore(from *TidbClusterInfo, to *TidbClusterInfo) error
DeployMonitor(info *TidbClusterInfo) error
CleanMonitor(info *TidbClusterInfo) error
ForceDeploy(info *TidbClusterInfo) error
CreateSecret(info *TidbClusterInfo) error
}
Expand Down Expand Up @@ -131,6 +138,7 @@ type TidbClusterInfo struct {
InsertBatchSize string
Resources map[string]string
Args map[string]string
Monitor bool
}

func (tc *TidbClusterInfo) HelmSetString() string {
Expand All @@ -150,6 +158,7 @@ func (tc *TidbClusterInfo) HelmSetString() string {
"tidb.image": tc.TiDBImage,
"tidb.passwordSecretName": "set-secret",
"tidb.initSql": initSql,
"monitor.create": strconv.FormatBool(tc.Monitor),
}

for k, v := range tc.Resources {
Expand Down Expand Up @@ -344,6 +353,12 @@ func (oa *operatorActions) CheckTidbClusterStatus(info *TidbClusterInfo) error {
return false, nil
}

if info.Monitor {
glog.Infof("check tidb monitor normal")
if b, err := oa.monitorNormal(info); !b && err == nil {
return false, nil
}
}
return true, nil
}); err != nil {
glog.Infof("check tidb cluster status failed: %s", err.Error())
Expand Down Expand Up @@ -858,6 +873,116 @@ func (oa *operatorActions) passwordIsSet(clusterInfo *TidbClusterInfo) (bool, er
return true, nil
}

func (oa *operatorActions) monitorNormal(clusterInfo *TidbClusterInfo) (bool, error) {
ns := clusterInfo.Namespace
tcName := clusterInfo.ClusterName
monitorDeploymentName := fmt.Sprintf("%s-monitor", tcName)
monitorDeployment, err := oa.kubeCli.AppsV1().Deployments(ns).Get(monitorDeploymentName, metav1.GetOptions{})
if err != nil {
glog.Errorf("get monitor deployment: [%s/%s] failed", ns, monitorDeploymentName)
return false, nil
}
if monitorDeployment.Status.ReadyReplicas < 1 {
glog.Info("monitor ready replicas %d < 1", monitorDeployment.Status.ReadyReplicas)
return false, nil
}
configuratorJobName := fmt.Sprintf("%s-monitor-configurator", tcName)
monitorJob, err := oa.kubeCli.BatchV1().Jobs(ns).Get(configuratorJobName, metav1.GetOptions{})
if err != nil {
glog.Info("get monitor configurator job: [%s/%s] failed", ns, configuratorJobName)
return false, nil
}
if monitorJob.Status.Succeeded == 0 {
glog.Info("the monitor configurator job: [%s/%s] had not success", ns, configuratorJobName)
return false, nil
}

if err := oa.checkPrometheus(clusterInfo); err != nil {
glog.Info("check [%s/%s]'s prometheus data failed: %v", ns, monitorDeploymentName, err)
return false, nil
}

if err := oa.checkGrafanaData(clusterInfo); err != nil {
glog.Info("check [%s/%s]'s grafana data failed: %v", ns, monitorDeploymentName, err)
return false, nil
}
return true, nil
}

func (oa *operatorActions) checkPrometheus(clusterInfo *TidbClusterInfo) error {
ns := clusterInfo.Namespace
tcName := clusterInfo.ClusterName
prometheusSvc := fmt.Sprintf("http://%s-prometheus.%s:9090/api/v1/query?query=up", tcName, ns)
resp, err := http.Get(prometheusSvc)
if err != nil {
return err
}
defer resp.Body.Close()
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
return err
}
response := &struct {
Status string `json:"status"`
}{}
err = json.Unmarshal(body, response)
if err != nil {
return err
}
if response.Status != "success" {
return fmt.Errorf("the prometheus's api[%s] has not ready", prometheusSvc)
}
return nil
}

func (oa *operatorActions) checkGrafanaData(clusterInfo *TidbClusterInfo) error {
ns := clusterInfo.Namespace
tcName := clusterInfo.ClusterName
svcName := fmt.Sprintf("%s-grafana", tcName)
end := time.Now()
start := end.Add(-time.Minute)
values := url.Values{}
values.Set("query", `sum(tikv_pd_heartbeat_tick_total{type="leader"}) by (job)`)
values.Set("start", fmt.Sprintf("%d", start.Unix()))
values.Set("end", fmt.Sprintf("%d", end.Unix()))
values.Set("step", "30")
u := fmt.Sprintf("http://%s.%s.svc.cluster.local:3000/api/datasources/proxy/1/api/v1/query_range?%s", svcName, ns, values.Encode())
req, err := http.NewRequest(http.MethodGet, u, nil)
if err != nil {
return err
}
req.SetBasicAuth(grafanaUsername, grafanaPassword)
client := &http.Client{}
resp, err := client.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
buf, err := ioutil.ReadAll(resp.Body)
if err != nil {
return err
}
data := struct {
Status string `json:"status"`
Data struct {
ResultType string `json:"resultType"`
Result []struct {
Metric struct {
Job string `json:"job"`
} `json:"metric"`
Values []interface{} `json:"values"`
} `json:"result"`
}
}{}
if err := json.Unmarshal(buf, &data); err != nil {
return err
}
if data.Status != "success" || len(data.Data.Result) < 1 {
return fmt.Errorf("invalid response: status: %s, result: %v", data.Status, data.Data.Result)
}
return nil
}

func getDSN(ns, tcName, databaseName, password string) string {
return fmt.Sprintf("root:%s@(%s-tidb.%s:4000)/%s?charset=utf8", password, tcName, ns, databaseName)
}
Expand Down

0 comments on commit 55a0b1a

Please sign in to comment.