diff --git a/pkg/manager/cert/manager.go b/pkg/manager/cert/manager.go
index 43cfafc6..21d559c8 100644
--- a/pkg/manager/cert/manager.go
+++ b/pkg/manager/cert/manager.go
@@ -24,6 +24,7 @@ import (
 	"github.com/pingcap/TiProxy/lib/util/errors"
 	"github.com/pingcap/TiProxy/lib/util/security"
 	"github.com/pingcap/TiProxy/lib/util/waitgroup"
+	"github.com/pingcap/TiProxy/pkg/metrics"
 	"go.uber.org/zap"
 )
 
@@ -134,8 +135,9 @@ func (cm *CertManager) reload() {
 	if err := cm.sqlTLS.Reload(cm.logger); err != nil {
 		errs = append(errs, err)
 	}
-	err := errors.Collect(errors.New("loading certs"), errs...)
-	if err != nil {
+	if len(errs) > 0 {
+		metrics.ServerErrCounter.WithLabelValues("load_cert").Add(float64(len(errs)))
+		err := errors.Collect(errors.New("loading certs"), errs...)
 		cm.logger.Error("failed to reload some certs", zap.Error(err))
 	}
 }
diff --git a/pkg/manager/router/backend_observer.go b/pkg/manager/router/backend_observer.go
index a45b633c..8dc8d3a2 100644
--- a/pkg/manager/router/backend_observer.go
+++ b/pkg/manager/router/backend_observer.go
@@ -231,7 +231,9 @@ func (bo *BackendObserver) checkHealth(ctx context.Context, backends map[string]
 		// Also dial the SQL port just in case that the SQL port hangs.
 		err := connectWithRetry(func() error {
+			startTime := time.Now()
 			conn, err := net.DialTimeout("tcp", addr, bo.healthCheckConfig.DialTimeout)
+			setPingBackendMetrics(addr, err == nil, startTime)
 			if err == nil {
 				if err := conn.Close(); err != nil && !pnet.IsDisconnectError(err) {
 					bo.logger.Error("close connection in health check failed", zap.Error(err))
diff --git a/pkg/manager/router/metrics.go b/pkg/manager/router/metrics.go
index 12175c5f..27156860 100644
--- a/pkg/manager/router/metrics.go
+++ b/pkg/manager/router/metrics.go
@@ -57,9 +57,15 @@ func addMigrateMetrics(from, to string, succeed bool, startTime time.Time) {
 	metrics.MigrateCounter.WithLabelValues(from, to, resLabel).Inc()
 	cost := time.Since(startTime)
-	metrics.MigrateDurationHistogram.WithLabelValues(from, to, resLabel).Observe(float64(cost.Milliseconds()))
+	metrics.MigrateDurationHistogram.WithLabelValues(from, to, resLabel).Observe(cost.Seconds())
 }
 
 func readMigrateCounter(from, to string, succeed bool) (int, error) {
 	return metrics.ReadCounter(metrics.MigrateCounter.WithLabelValues(from, to, succeedToLabel(succeed)))
 }
+
+func setPingBackendMetrics(addr string, succeed bool, startTime time.Time) {
+	cost := time.Since(startTime)
+	resLabel := succeedToLabel(succeed)
+	metrics.PingBackendGauge.WithLabelValues(addr, resLabel).Set(cost.Seconds())
+}
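The router metrics above now record durations in seconds (Observe(cost.Seconds())) and publish a per-backend ping gauge from every health-check dial. The sketch below is illustrative only and not part of this diff: it shows how setPingBackendMetrics could be exercised in a package-level test; the testify/require and prometheus testutil imports, the test file placement, and reuse of the package's succeedToLabel helper are assumptions.

    package router

    import (
        "testing"
        "time"

        "github.com/pingcap/TiProxy/pkg/metrics"
        "github.com/prometheus/client_golang/prometheus/testutil"
        "github.com/stretchr/testify/require"
    )

    func TestSetPingBackendMetrics(t *testing.T) {
        // Pretend the dial started 100ms ago so the recorded value is non-trivial.
        start := time.Now().Add(-100 * time.Millisecond)
        setPingBackendMetrics("127.0.0.1:4000", true, start)
        // The gauge keeps the latest dial duration in seconds, labeled by backend address and result.
        v := testutil.ToFloat64(metrics.PingBackendGauge.WithLabelValues("127.0.0.1:4000", succeedToLabel(true)))
        require.GreaterOrEqual(t, v, 0.1)
    }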
diff --git a/pkg/metrics/backend.go b/pkg/metrics/backend.go
new file mode 100644
index 00000000..a5ab7039
--- /dev/null
+++ b/pkg/metrics/backend.go
@@ -0,0 +1,57 @@
+// Copyright 2023 PingCAP, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package metrics
+
+import "github.com/prometheus/client_golang/prometheus"
+
+const (
+	LblRes    = "res"
+	LblStatus = "status"
+)
+
+var (
+	BackendStatusGauge = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Namespace: ModuleProxy,
+			Subsystem: LabelBackend,
+			Name:      "b_status",
+			Help:      "Gauge of backend status.",
+		}, []string{LblBackend, LblStatus})
+
+	GetBackendHistogram = prometheus.NewHistogram(
+		prometheus.HistogramOpts{
+			Namespace: ModuleProxy,
+			Subsystem: LabelBackend,
+			Name:      "get_backend_duration_seconds",
+			Help:      "Bucketed histogram of time (s) for getting an available backend.",
+			Buckets:   prometheus.ExponentialBuckets(0.000001, 2, 26), // 1us ~ 30s
+		})
+
+	GetBackendCounter = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: ModuleProxy,
+			Subsystem: LabelBackend,
+			Name:      "get_backend",
+			Help:      "Counter of getting backend.",
+		}, []string{LblRes})
+
+	PingBackendGauge = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Namespace: ModuleProxy,
+			Subsystem: LabelBackend,
+			Name:      "ping_duration_seconds",
+			Help:      "Time (s) of pinging the SQL port of each backend.",
+		}, []string{LblBackend, LblRes})
+)
diff --git a/pkg/metrics/balance.go b/pkg/metrics/balance.go
index ce158170..4b02f2d3 100644
--- a/pkg/metrics/balance.go
+++ b/pkg/metrics/balance.go
@@ -24,19 +24,10 @@ const (
 	LblBackend       = "backend"
 	LblFrom          = "from"
 	LblTo            = "to"
-	LblStatus        = "status"
 	LblMigrateResult = "migrate_res"
 )
 
 var (
-	BackendStatusGauge = prometheus.NewGaugeVec(
-		prometheus.GaugeOpts{
-			Namespace: ModuleProxy,
-			Subsystem: LabelBalance,
-			Name:      "b_status",
-			Help:      "Gauge of backend status.",
-		}, []string{LblBackend, LblStatus})
-
 	BackendConnGauge = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Namespace: ModuleProxy,
@@ -57,8 +48,8 @@ var (
 		prometheus.HistogramOpts{
 			Namespace: ModuleProxy,
 			Subsystem: LabelBalance,
-			Name:      "migrate_duration_millis",
-			Help:      "Bucketed histogram of migrating time (ms) of sessions.",
-			Buckets:   prometheus.ExponentialBuckets(0.1, 2, 26), // 0.1ms ~ 1h
+			Name:      "migrate_duration_seconds",
+			Help:      "Bucketed histogram of migrating time (s) of sessions.",
+			Buckets:   prometheus.ExponentialBuckets(0.0001, 2, 26), // 0.1ms ~ 1h
 		}, []string{LblFrom, LblTo, LblMigrateResult})
 )
diff --git a/pkg/metrics/grafana/tiproxy_summary.json b/pkg/metrics/grafana/tiproxy_summary.json
index 9c258f74..510626e4 100644
--- a/pkg/metrics/grafana/tiproxy_summary.json
+++ b/pkg/metrics/grafana/tiproxy_summary.json
@@ -256,7 +256,7 @@
       "repeat": null,
       "seriesOverrides": [ ],
       "spaceLength": 10,
-      "stack": true,
+      "stack": false,
       "steppedLine": false,
       "targets": [
         {
@@ -349,7 +349,7 @@
       "repeat": null,
       "seriesOverrides": [ ],
       "spaceLength": 10,
-      "stack": true,
+      "stack": false,
       "steppedLine": false,
       "targets": [
         {
@@ -358,6 +358,13 @@
           "intervalFactor": 2,
           "legendFormat": "{{instance}}",
           "refId": "A"
+        },
+        {
+          "expr": "sum(go_goroutines{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", job=\"tiproxy\"})",
+          "format": "time_series",
+          "intervalFactor": 2,
+          "legendFormat": "total",
+          "refId": "B"
         }
       ],
       "thresholds": [ ],
@@ -559,7 +566,7 @@
         "refId": "B"
       },
       {
-        "expr": "sum(rate(tiproxy_session_query_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[30s])) / sum(rate(tiproxy_session_query_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s]))",
+        "expr": "sum(rate(tiproxy_session_query_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[30s])) / sum(rate(tiproxy_session_query_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s]))",
         "format": "time_series",
         "intervalFactor": 2,
         "legendFormat": "avg",
@@ -1009,7 +1016,7 @@
       "steppedLine": false,
       "targets": [
         {
-          "expr": "sum(label_replace(label_replace(rate(tiproxy_balance_migrate_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]), \"from\", \"$1\", \"from\", \"(.+-tidb-[0-9]+).*peer.*.svc.*\"), \"to\", \"$1\", \"to\", \"(.+-tidb-[0-9]+).*peer.*.svc.*\")) by (from, to, migrate_res)",
+          "expr": "label_replace(label_replace(tiproxy_balance_migrate_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}, \"from\", \"$1\", \"from\", \"(.+-tidb-[0-9]+).*peer.*.svc.*\"), \"to\", \"$1\", \"to\", \"(.+-tidb-[0-9]+).*peer.*.svc.*\")",
           "format": "time_series",
           "intervalFactor": 2,
           "legendFormat": "{{migrate_res}}: {{from}} => {{to}}",
@@ -1095,21 +1102,21 @@
       "steppedLine": false,
       "targets": [
         {
-          "expr": "histogram_quantile(0.99, sum(rate(tiproxy_balance_migrate_duration_millis_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))",
+          "expr": "histogram_quantile(0.99, sum(rate(tiproxy_balance_migrate_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))",
           "format": "time_series",
           "intervalFactor": 2,
           "legendFormat": "99",
           "refId": "A"
         },
         {
-          "expr": "histogram_quantile(0.95, sum(rate(tiproxy_balance_migrate_duration_millis_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))",
+          "expr": "histogram_quantile(0.95, sum(rate(tiproxy_balance_migrate_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))",
           "format": "time_series",
           "intervalFactor": 2,
           "legendFormat": "95",
           "refId": "B"
         },
         {
-          "expr": "sum(rate(tiproxy_balance_migrate_duration_millis_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[30s])) / sum(rate(tiproxy_balance_migrate_duration_millis_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s]))",
+          "expr": "sum(rate(tiproxy_balance_migrate_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[30s])) / sum(rate(tiproxy_balance_migrate_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s]))",
           "format": "time_series",
           "intervalFactor": 2,
           "legendFormat": "avg",
@@ -1178,7 +1185,7 @@
       "dashLength": 10,
       "dashes": false,
       "datasource": "${DS_TEST-CLUSTER}",
-      "description": "Duration of getting an available backend.",
+      "description": "Number of attempts to get an available backend.",
       "fill": 1,
       "fillGradient": 0,
       "gridPos": {
@@ -1215,21 +1222,107 @@
       "steppedLine": false,
       "targets": [
         {
-          "expr": "histogram_quantile(0.99, sum(rate(tiproxy_session_get_backend_duration_millis_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))",
+          "expr": "tiproxy_backend_get_backend{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}",
+          "format": "time_series",
+          "intervalFactor": 2,
+          "legendFormat": "{{res}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [ ],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Get Backend Count",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": [ ]
+      },
+      "yaxes": [
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ]
+    },
+    {
+      "aliasColors": { },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "${DS_TEST-CLUSTER}",
+      "description": "Duration of getting an available backend.",
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 12,
+        "y": 0
+      },
+      "id": 19,
+      "legend": {
+        "alignAsTable": false,
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "rightSide": true,
+        "show": true,
+        "sideWidth": null,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [ ],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "repeat": null,
+      "seriesOverrides": [ ],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.99, sum(rate(tiproxy_backend_get_backend_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))",
           "format": "time_series",
           "intervalFactor": 2,
           "legendFormat": "99",
           "refId": "A"
         },
         {
-          "expr": "histogram_quantile(0.95, sum(rate(tiproxy_session_get_backend_duration_millis_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))",
+          "expr": "histogram_quantile(0.95, sum(rate(tiproxy_backend_get_backend_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))",
           "format": "time_series",
           "intervalFactor": 2,
           "legendFormat": "95",
           "refId": "B"
         },
         {
-          "expr": "sum(rate(tiproxy_session_get_backend_duration_millis_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[30s])) / sum(rate(tiproxy_session_get_backend_duration_millis_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s]))",
+          "expr": "sum(rate(tiproxy_backend_get_backend_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[30s])) / sum(rate(tiproxy_backend_get_backend_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s]))",
           "format": "time_series",
           "intervalFactor": 2,
           "legendFormat": "avg",
@@ -1271,6 +1364,92 @@
           "show": true
         }
       ]
+    },
+    {
+      "aliasColors": { },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "${DS_TEST-CLUSTER}",
+      "description": "Duration of pinging backends.",
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 0
+      },
+      "id": 20,
+      "legend": {
+        "alignAsTable": false,
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "rightSide": true,
+        "show": true,
+        "sideWidth": null,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [ ],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "repeat": null,
+      "seriesOverrides": [ ],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "label_replace(tiproxy_backend_ping_duration_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}, \"backend\", \"$1\", \"backend\", \"(.+-tidb-[0-9]+).*peer.*.svc.*\")",
+          "format": "time_series",
+          "intervalFactor": 2,
+          "legendFormat": "{{instance}} | {{backend}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [ ],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Ping Backend Duration",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": [ ]
+      },
+      "yaxes": [
+        {
+          "format": "s",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "s",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ]
     }
   ],
   "repeat": null,
diff --git a/pkg/metrics/grafana/tiproxy_summary.jsonnet b/pkg/metrics/grafana/tiproxy_summary.jsonnet
index 4eaac935..58aabb11 100644
--- a/pkg/metrics/grafana/tiproxy_summary.jsonnet
+++ b/pkg/metrics/grafana/tiproxy_summary.jsonnet
@@ -92,7 +92,6 @@ local connectionP = graphPanel.new(
   legend_rightSide=true,
   description='TiProxy current connection counts.',
   format='short',
-  stack=true,
 )
 .addTarget(
   prometheus.target(
@@ -113,13 +112,18 @@ local goroutineP = graphPanel.new(
   legend_rightSide=true,
   description='TiProxy current goroutine counts.',
   format='short',
-  stack=true,
 )
 .addTarget(
   prometheus.target(
     'go_goroutines{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance", job="tiproxy"}',
     legendFormat='{{instance}}',
   )
+)
+.addTarget(
+  prometheus.target(
+    'sum(go_goroutines{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", job="tiproxy"})',
+    legendFormat='total',
+  )
 );
 
 local cpuP = graphPanel.new(
@@ -199,11 +203,25 @@ local durationP = graphPanel.new(
 )
 .addTarget(
   prometheus.target(
-    'sum(rate(tiproxy_session_query_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[30s])) / sum(rate(tiproxy_session_query_duration_seconds_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[30s]))',
+    'sum(rate(tiproxy_session_query_duration_seconds_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[30s])) / sum(rate(tiproxy_session_query_duration_seconds_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[30s]))',
     legendFormat='avg',
   )
 );
 
+local durationByBackP = graphPanel.new(
+  title='Duration By Backend',
+  datasource=myDS,
+  legend_rightSide=true,
+  description='TiProxy P99 query durations by instances and backends.',
+  format='s',
+)
+.addTarget(
+  prometheus.target(
+    'label_replace(histogram_quantile(0.99, sum(rate(tiproxy_session_query_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[1m])) by (le, instance, backend)), "backend", "$1", "backend", "(.+-tidb-[0-9]+).*peer.*.svc.*")',
+    legendFormat='{{instance}} | {{backend}}',
+  )
+);
+
 local cpsByInstP = graphPanel.new(
   title='CPS by Instance',
   datasource=myDS,
@@ -271,7 +289,7 @@ local bMigCounterP = graphPanel.new(
 )
 .addTarget(
   prometheus.target(
-    'sum(label_replace(label_replace(rate(tiproxy_balance_migrate_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[1m]), "from", "$1", "from", "(.+-tidb-[0-9]+).*peer.*.svc.*"), "to", "$1", "to", "(.+-tidb-[0-9]+).*peer.*.svc.*")) by (from, to, migrate_res)',
+    'label_replace(label_replace(tiproxy_balance_migrate_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}, "from", "$1", "from", "(.+-tidb-[0-9]+).*peer.*.svc.*"), "to", "$1", "to", "(.+-tidb-[0-9]+).*peer.*.svc.*")',
     legendFormat='{{migrate_res}}: {{from}} => {{to}}',
   )
 );
@@ -285,19 +303,19 @@ local bMigDurP = graphPanel.new(
 )
 .addTarget(
   prometheus.target(
-    'histogram_quantile(0.99, sum(rate(tiproxy_balance_migrate_duration_millis_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[1m])) by (le))',
+    'histogram_quantile(0.99, sum(rate(tiproxy_balance_migrate_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[1m])) by (le))',
     legendFormat='99',
   )
 )
 .addTarget(
   prometheus.target(
-    'histogram_quantile(0.95, sum(rate(tiproxy_balance_migrate_duration_millis_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[1m])) by (le))',
+    'histogram_quantile(0.95, sum(rate(tiproxy_balance_migrate_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[1m])) by (le))',
     legendFormat='95',
   )
 )
 .addTarget(
   prometheus.target(
-    'sum(rate(tiproxy_balance_migrate_duration_millis_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[30s])) / sum(rate(tiproxy_balance_migrate_duration_millis_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[30s]))',
+    'sum(rate(tiproxy_balance_migrate_duration_seconds_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[30s])) / sum(rate(tiproxy_balance_migrate_duration_seconds_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[30s]))',
     legendFormat='avg',
   )
 );
@@ -313,23 +331,51 @@ local bGetDurP = graphPanel.new(
 )
 .addTarget(
   prometheus.target(
-    'histogram_quantile(0.99, sum(rate(tiproxy_session_get_backend_duration_millis_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[1m])) by (le))',
+    'histogram_quantile(0.99, sum(rate(tiproxy_backend_get_backend_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[1m])) by (le))',
     legendFormat='99',
   )
 )
 .addTarget(
   prometheus.target(
-    'histogram_quantile(0.95, sum(rate(tiproxy_session_get_backend_duration_millis_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[1m])) by (le))',
+    'histogram_quantile(0.95, sum(rate(tiproxy_backend_get_backend_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[1m])) by (le))',
     legendFormat='95',
   )
 )
 .addTarget(
   prometheus.target(
-    'sum(rate(tiproxy_session_get_backend_duration_millis_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[30s])) / sum(rate(tiproxy_session_get_backend_duration_millis_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[30s]))',
+    'sum(rate(tiproxy_backend_get_backend_duration_seconds_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[30s])) / sum(rate(tiproxy_backend_get_backend_duration_seconds_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[30s]))',
     legendFormat='avg',
   )
 );
+local bGetBeP = graphPanel.new(
+  title='Get Backend Count',
+  datasource=myDS,
+  legend_rightSide=true,
+  description='Number of attempts to get an available backend.',
+  format='short',
+)
+.addTarget(
+  prometheus.target(
+    'tiproxy_backend_get_backend{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}',
+    legendFormat='{{res}}',
+  )
+);
+
+local bPingBeP = graphPanel.new(
+  title='Ping Backend Duration',
+  datasource=myDS,
+  legend_rightSide=true,
+  description='Duration of pinging backends.',
+  format='s',
+)
+.addTarget(
+  prometheus.target(
+    'label_replace(tiproxy_backend_ping_duration_seconds{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}, "backend", "$1", "backend", "(.+-tidb-[0-9]+).*peer.*.svc.*")',
+    legendFormat='{{instance}} | {{backend}}',
+  )
+);
+
 // Merge together.
 local panelW = 12;
 local panelH = 6;
@@ -370,7 +416,9 @@ newDash
 )
 .addPanel(
   backendRow
-  .addPanel(bGetDurP, gridPos=leftPanelPos)
+  .addPanel(bGetBeP, gridPos=leftPanelPos)
+  .addPanel(bGetDurP, gridPos=rightPanelPos)
+  .addPanel(bPingBeP, gridPos=leftPanelPos)
   , gridPos=rowPos
 )
diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
index 12d4e45d..75c7a5f6 100644
--- a/pkg/metrics/metrics.go
+++ b/pkg/metrics/metrics.go
@@ -43,6 +43,7 @@ const (
 	LabelBalance = "balance"
 	LabelSession = "session"
 	LabelMonitor = "monitor"
+	LabelBackend = "backend"
 )
 
 // MetricsManager manages metrics.
@@ -114,14 +115,18 @@ func registerProxyMetrics() {
 	prometheus.MustRegister(collectors.NewGoCollector(collectors.WithGoCollections(collectors.GoRuntimeMetricsCollection | collectors.GoRuntimeMemStatsCollection)))
 	prometheus.MustRegister(ConnGauge)
+	prometheus.MustRegister(MaxProcsGauge)
+	prometheus.MustRegister(ServerEventCounter)
+	prometheus.MustRegister(ServerErrCounter)
 	prometheus.MustRegister(TimeJumpBackCounter)
 	prometheus.MustRegister(KeepAliveCounter)
-	prometheus.MustRegister(MaxProcsGauge)
-	prometheus.MustRegister(BackendStatusGauge)
-	prometheus.MustRegister(BackendConnGauge)
 	prometheus.MustRegister(QueryTotalCounter)
 	prometheus.MustRegister(QueryDurationHistogram)
+	prometheus.MustRegister(BackendStatusGauge)
 	prometheus.MustRegister(GetBackendHistogram)
+	prometheus.MustRegister(GetBackendCounter)
+	prometheus.MustRegister(PingBackendGauge)
+	prometheus.MustRegister(BackendConnGauge)
 	prometheus.MustRegister(MigrateCounter)
 	prometheus.MustRegister(MigrateDurationHistogram)
 }
diff --git a/pkg/metrics/server.go b/pkg/metrics/server.go
index afc04683..d1b2a1fa 100644
--- a/pkg/metrics/server.go
+++ b/pkg/metrics/server.go
@@ -19,6 +19,13 @@ import (
 	"github.com/prometheus/client_golang/prometheus"
 )
 
+const (
+	LblType = "type"
+
+	EventStart = "start"
+	EventClose = "close"
+)
+
 var (
 	ConnGauge = prometheus.NewGauge(
 		prometheus.GaugeOpts{
@@ -36,6 +43,22 @@ var (
 			Help:      "The value of GOMAXPROCS.",
 		})
 
+	ServerEventCounter = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: ModuleProxy,
+			Subsystem: LabelServer,
+			Name:      "event",
+			Help:      "Counter of TiProxy events.",
+		}, []string{LblType})
+
+	ServerErrCounter = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: ModuleProxy,
+			Subsystem: LabelServer,
+			Name:      "err",
+			Help:      "Counter of server errors.",
+		}, []string{LblType})
+
 	TimeJumpBackCounter = prometheus.NewCounter(
 		prometheus.CounterOpts{
 			Namespace: ModuleProxy,
diff --git a/pkg/metrics/session.go b/pkg/metrics/session.go
index 6f5da298..f145d90d 100644
--- a/pkg/metrics/session.go
+++ b/pkg/metrics/session.go
@@ -39,13 +39,4 @@ var (
 			Help:      "Bucketed histogram of processing time (s) of handled queries.",
 			Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 29), // 0.5ms ~ 1.5days
 		}, []string{LblBackend, LblCmdType})
-
-	GetBackendHistogram = prometheus.NewHistogram(
-		prometheus.HistogramOpts{
-			Namespace: ModuleProxy,
-			Subsystem: LabelSession,
-			Name:      "get_backend_duration_millis",
-			Help:      "Bucketed histogram of time (ms) for getting an available backend.",
-			Buckets:   prometheus.ExponentialBuckets(0.001, 2, 26), // 1us ~ 30s
-		})
 )
diff --git a/pkg/proxy/backend/backend_conn_mgr.go b/pkg/proxy/backend/backend_conn_mgr.go
index a2c63348..028382b7 100644
--- a/pkg/proxy/backend/backend_conn_mgr.go
+++ b/pkg/proxy/backend/backend_conn_mgr.go
@@ -253,7 +253,7 @@ func (mgr *BackendConnManager) getBackendIO(cctx ConnContext, auth *Authenticato
 	cancel()
 
 	duration := time.Since(startTime)
-	addGetBackendMetrics(duration)
+	addGetBackendMetrics(duration, err == nil)
 	if err != nil {
 		mgr.logger.Error("get backend failed", zap.Duration("duration", duration), zap.NamedError("last_err", origErr))
 	} else if duration >= 3*time.Second {
diff --git a/pkg/proxy/backend/metrics.go b/pkg/proxy/backend/metrics.go
index f529a532..60d8acd8 100644
--- a/pkg/proxy/backend/metrics.go
+++ b/pkg/proxy/backend/metrics.go
@@ -36,6 +36,11 @@ func readCmdCounter(cmd byte, addr string) (int, error) {
 	return metrics.ReadCounter(metrics.QueryTotalCounter.WithLabelValues(addr, label))
 }
 
-func addGetBackendMetrics(duration time.Duration) {
-	metrics.GetBackendHistogram.Observe(float64(duration.Milliseconds()))
+func addGetBackendMetrics(duration time.Duration, succeed bool) {
+	metrics.GetBackendHistogram.Observe(duration.Seconds())
+	lbl := "succeed"
+	if !succeed {
+		lbl = "fail"
+	}
+	metrics.GetBackendCounter.WithLabelValues(lbl).Inc()
 }
diff --git a/pkg/server/server.go b/pkg/server/server.go
index 46cfcf71..aa6a5ea0 100644
--- a/pkg/server/server.go
+++ b/pkg/server/server.go
@@ -81,6 +81,7 @@ func NewServer(ctx context.Context, sctx *sctx.Context) (srv *Server, err error)
 	// setup metrics
 	srv.MetricsManager.Init(ctx, lg.Named("metrics"), cfg.Metrics.MetricsAddr, cfg.Metrics.MetricsInterval, cfg.Proxy.Addr)
+	metrics.ServerEventCounter.WithLabelValues(metrics.EventStart).Inc()
 
 	// setup certs
 	if err = srv.CertManager.Init(cfg, lg.Named("cert")); err != nil {
@@ -159,6 +160,8 @@ func NewServer(ctx context.Context, sctx *sctx.Context) (srv *Server, err error)
 }
 
 func (s *Server) Close() error {
+	metrics.ServerEventCounter.WithLabelValues(metrics.EventClose).Inc()
+
 	errs := make([]error, 0, 4)
 	if s.Proxy != nil {
 		errs = append(errs, s.Proxy.Close())
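The new server counters are incremented on server start and close and on failed certificate reloads (the "load_cert" label added in pkg/manager/cert above). The sketch below is illustrative only and not part of this diff: it reads the counters back with prometheus's testutil; the testify dependency and the test's placement inside package metrics are assumptions.

    package metrics

    import (
        "testing"

        "github.com/prometheus/client_golang/prometheus/testutil"
        "github.com/stretchr/testify/require"
    )

    func TestServerCounters(t *testing.T) {
        // NewServer increments the "start" event; Close increments the "close" event.
        before := testutil.ToFloat64(ServerEventCounter.WithLabelValues(EventStart))
        ServerEventCounter.WithLabelValues(EventStart).Inc()
        require.Equal(t, before+1, testutil.ToFloat64(ServerEventCounter.WithLabelValues(EventStart)))

        // Each failed certificate reload adds the number of collected errors under "load_cert".
        ServerErrCounter.WithLabelValues("load_cert").Add(2)
        require.GreaterOrEqual(t, testutil.ToFloat64(ServerErrCounter.WithLabelValues("load_cert")), 2.0)
    }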