diff --git a/pkg/metrics/balance.go b/pkg/metrics/balance.go index f2871328..ce158170 100644 --- a/pkg/metrics/balance.go +++ b/pkg/metrics/balance.go @@ -58,7 +58,7 @@ var ( Namespace: ModuleProxy, Subsystem: LabelBalance, Name: "migrate_duration_millis", - Help: "Bucketed histogram of migrating time (s) of sessions.", + Help: "Bucketed histogram of migrating time (ms) of sessions.", Buckets: prometheus.ExponentialBuckets(0.1, 2, 26), // 0.1ms ~ 1h }, []string{LblFrom, LblTo, LblMigrateResult}) ) diff --git a/pkg/metrics/grafana/tiproxy_summary.json b/pkg/metrics/grafana/tiproxy_summary.json index d53d93b4..40943ad0 100644 --- a/pkg/metrics/grafana/tiproxy_summary.json +++ b/pkg/metrics/grafana/tiproxy_summary.json @@ -1067,6 +1067,126 @@ "title": "Balance", "titleSize": "h6", "type": "row" + }, + { + "collapse": true, + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 16, + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "Duration of getting an available backend.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 17, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(tiproxy_session_get_backend_duration_millis_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "99", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(tiproxy_session_get_backend_duration_millis_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "95", + "refId": "B" + }, + { + "expr": "sum(rate(tiproxy_session_get_backend_duration_millis_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[30s])) / sum(rate(tiproxy_session_get_backend_duration_millis_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "avg", + "refId": "C" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Get Backend Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Backend", + "titleSize": "h6", + "type": "row" } ], "refresh": "30s", diff --git a/pkg/metrics/grafana/tiproxy_summary.jsonnet b/pkg/metrics/grafana/tiproxy_summary.jsonnet index 37f501fd..cb8af2be 100644 --- a/pkg/metrics/grafana/tiproxy_summary.jsonnet +++ b/pkg/metrics/grafana/tiproxy_summary.jsonnet @@ -282,6 +282,34 @@ local bMigDurP = graphPanel.new( ) ); +// Backend Summary +local backendRow = row.new(collapse=true, title='Backend'); +local bGetDurP = graphPanel.new( + title='Get Backend Duration', + datasource=myDS, + legend_rightSide=true, + description='Duration of getting an available backend.', + format='s', +) +.addTarget( + prometheus.target( + 'histogram_quantile(0.99, sum(rate(tiproxy_session_get_backend_duration_millis_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[1m])) by (le))', + legendFormat='99', + ) +) +.addTarget( + prometheus.target( + 'histogram_quantile(0.95, sum(rate(tiproxy_session_get_backend_duration_millis_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[1m])) by (le))', + legendFormat='95', + ) +) +.addTarget( + prometheus.target( + 'sum(rate(tiproxy_session_get_backend_duration_millis_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[30s])) / sum(rate(tiproxy_session_get_backend_duration_millis_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[30s]))', + legendFormat='avg', + ) +); + // Merge together. local panelW = 12; local panelH = 6; @@ -319,3 +347,9 @@ newDash , gridPos=rowPos ) +.addPanel( + backendRow + .addPanel(bGetDurP, gridPos=leftPanelPos) + , + gridPos=rowPos +) diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index bd8c5875..87eb3ac7 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -120,6 +120,7 @@ func registerProxyMetrics() { prometheus.MustRegister(BackendConnGauge) prometheus.MustRegister(QueryTotalCounter) prometheus.MustRegister(QueryDurationHistogram) + prometheus.MustRegister(GetBackendHistogram) prometheus.MustRegister(MigrateCounter) prometheus.MustRegister(MigrateDurationHistogram) } diff --git a/pkg/metrics/session.go b/pkg/metrics/session.go index f145d90d..6f5da298 100644 --- a/pkg/metrics/session.go +++ b/pkg/metrics/session.go @@ -39,4 +39,13 @@ var ( Help: "Bucketed histogram of processing time (s) of handled queries.", Buckets: prometheus.ExponentialBuckets(0.0005, 2, 29), // 0.5ms ~ 1.5days }, []string{LblBackend, LblCmdType}) + + GetBackendHistogram = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Namespace: ModuleProxy, + Subsystem: LabelSession, + Name: "get_backend_duration_millis", + Help: "Bucketed histogram of time (ms) for getting an available backend.", + Buckets: prometheus.ExponentialBuckets(0.001, 2, 26), // 1us ~ 30s + }) ) diff --git a/pkg/proxy/backend/backend_conn_mgr.go b/pkg/proxy/backend/backend_conn_mgr.go index fd235564..c99642ae 100644 --- a/pkg/proxy/backend/backend_conn_mgr.go +++ b/pkg/proxy/backend/backend_conn_mgr.go @@ -191,6 +191,7 @@ func (mgr *BackendConnManager) getBackendIO(cctx ConnContext, auth *Authenticato // - One TiDB may be just shut down and another is just started but not ready yet bctx, cancel := context.WithTimeout(context.Background(), timeout) selector := r.GetBackendSelector() + startTime := time.Now() var addr string var origErr error io, err := backoff.RetryNotifyWithData( @@ -237,6 +238,15 @@ func (mgr *BackendConnManager) getBackendIO(cctx ConnContext, auth *Authenticato }, ) cancel() + + duration := time.Since(startTime) + addGetBackendMetrics(duration) + if err != nil { + mgr.logger.Error("get backend failed", zap.Duration("duration", duration), zap.NamedError("last_err", origErr)) + } else if duration >= 3*time.Second { + mgr.logger.Warn("get backend slow", zap.Duration("duration", duration), zap.NamedError("last_err", origErr), + zap.Stringer("backend_addr", mgr.backendIO.RemoteAddr())) + } if err != nil && errors.Is(err, context.DeadlineExceeded) { if origErr != nil { err = origErr diff --git a/pkg/proxy/backend/metrics.go b/pkg/proxy/backend/metrics.go index 7ef74ff4..a5ef888c 100644 --- a/pkg/proxy/backend/metrics.go +++ b/pkg/proxy/backend/metrics.go @@ -61,3 +61,7 @@ func readCmdCounter(cmd byte, addr string) (int, error) { } return metrics.ReadCounter(metrics.QueryTotalCounter.WithLabelValues(addr, label)) } + +func addGetBackendMetrics(duration time.Duration) { + metrics.GetBackendHistogram.Observe(float64(duration.Milliseconds())) +}