Skip to content

Commit

Permalink
backend, metrics: add logs and metrics for getting a backend (pingcap…
Browse files Browse the repository at this point in the history
  • Loading branch information
djshow832 authored and xhebox committed Mar 13, 2023
1 parent c9aa412 commit 05b8b44
Show file tree
Hide file tree
Showing 7 changed files with 179 additions and 1 deletion.
2 changes: 1 addition & 1 deletion pkg/metrics/balance.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ var (
Namespace: ModuleProxy,
Subsystem: LabelBalance,
Name: "migrate_duration_millis",
Help: "Bucketed histogram of migrating time (s) of sessions.",
Help: "Bucketed histogram of migrating time (ms) of sessions.",
Buckets: prometheus.ExponentialBuckets(0.1, 2, 26), // 0.1ms ~ 1h
}, []string{LblFrom, LblTo, LblMigrateResult})
)
120 changes: 120 additions & 0 deletions pkg/metrics/grafana/tiproxy_summary.json
Original file line number Diff line number Diff line change
Expand Up @@ -1067,6 +1067,126 @@
"title": "Balance",
"titleSize": "h6",
"type": "row"
},
{
"collapse": true,
"collapsed": true,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 16,
"panels": [
{
"aliasColors": { },
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_TEST-CLUSTER}",
"description": "Duration of getting an available backend.",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 6,
"w": 12,
"x": 0,
"y": 0
},
"id": 17,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [ ],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(tiproxy_session_get_backend_duration_millis_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "99",
"refId": "A"
},
{
"expr": "histogram_quantile(0.95, sum(rate(tiproxy_session_get_backend_duration_millis_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "95",
"refId": "B"
},
{
"expr": "sum(rate(tiproxy_session_get_backend_duration_millis_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[30s])) / sum(rate(tiproxy_session_get_backend_duration_millis_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "avg",
"refId": "C"
}
],
"thresholds": [ ],
"timeFrom": null,
"timeShift": null,
"title": "Get Backend Duration",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [ ]
},
"yaxes": [
{
"format": "s",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "s",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Backend",
"titleSize": "h6",
"type": "row"
}
],
"refresh": "30s",
Expand Down
34 changes: 34 additions & 0 deletions pkg/metrics/grafana/tiproxy_summary.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,34 @@ local bMigDurP = graphPanel.new(
)
);

// Backend Summary
local backendRow = row.new(collapse=true, title='Backend');
local bGetDurP = graphPanel.new(
title='Get Backend Duration',
datasource=myDS,
legend_rightSide=true,
description='Duration of getting an available backend.',
format='s',
)
.addTarget(
prometheus.target(
'histogram_quantile(0.99, sum(rate(tiproxy_session_get_backend_duration_millis_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[1m])) by (le))',
legendFormat='99',
)
)
.addTarget(
prometheus.target(
'histogram_quantile(0.95, sum(rate(tiproxy_session_get_backend_duration_millis_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[1m])) by (le))',
legendFormat='95',
)
)
.addTarget(
prometheus.target(
'sum(rate(tiproxy_session_get_backend_duration_millis_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance"}[30s])) / sum(rate(tiproxy_session_get_backend_duration_millis_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[30s]))',
legendFormat='avg',
)
);

// Merge together.
local panelW = 12;
local panelH = 6;
Expand Down Expand Up @@ -319,3 +347,9 @@ newDash
,
gridPos=rowPos
)
.addPanel(
backendRow
.addPanel(bGetDurP, gridPos=leftPanelPos)
,
gridPos=rowPos
)
1 change: 1 addition & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ func registerProxyMetrics() {
prometheus.MustRegister(BackendConnGauge)
prometheus.MustRegister(QueryTotalCounter)
prometheus.MustRegister(QueryDurationHistogram)
prometheus.MustRegister(GetBackendHistogram)
prometheus.MustRegister(MigrateCounter)
prometheus.MustRegister(MigrateDurationHistogram)
}
Expand Down
9 changes: 9 additions & 0 deletions pkg/metrics/session.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,13 @@ var (
Help: "Bucketed histogram of processing time (s) of handled queries.",
Buckets: prometheus.ExponentialBuckets(0.0005, 2, 29), // 0.5ms ~ 1.5days
}, []string{LblBackend, LblCmdType})

GetBackendHistogram = prometheus.NewHistogram(
prometheus.HistogramOpts{
Namespace: ModuleProxy,
Subsystem: LabelSession,
Name: "get_backend_duration_millis",
Help: "Bucketed histogram of time (ms) for getting an available backend.",
Buckets: prometheus.ExponentialBuckets(0.001, 2, 26), // 1us ~ 30s
})
)
10 changes: 10 additions & 0 deletions pkg/proxy/backend/backend_conn_mgr.go
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ func (mgr *BackendConnManager) getBackendIO(cctx ConnContext, auth *Authenticato
// - One TiDB may be just shut down and another is just started but not ready yet
bctx, cancel := context.WithTimeout(context.Background(), timeout)
selector := r.GetBackendSelector()
startTime := time.Now()
var addr string
var origErr error
io, err := backoff.RetryNotifyWithData(
Expand Down Expand Up @@ -237,6 +238,15 @@ func (mgr *BackendConnManager) getBackendIO(cctx ConnContext, auth *Authenticato
},
)
cancel()

duration := time.Since(startTime)
addGetBackendMetrics(duration)
if err != nil {
mgr.logger.Error("get backend failed", zap.Duration("duration", duration), zap.NamedError("last_err", origErr))
} else if duration >= 3*time.Second {
mgr.logger.Warn("get backend slow", zap.Duration("duration", duration), zap.NamedError("last_err", origErr),
zap.Stringer("backend_addr", mgr.backendIO.RemoteAddr()))
}
if err != nil && errors.Is(err, context.DeadlineExceeded) {
if origErr != nil {
err = origErr
Expand Down
4 changes: 4 additions & 0 deletions pkg/proxy/backend/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,7 @@ func readCmdCounter(cmd byte, addr string) (int, error) {
}
return metrics.ReadCounter(metrics.QueryTotalCounter.WithLabelValues(addr, label))
}

func addGetBackendMetrics(duration time.Duration) {
metrics.GetBackendHistogram.Observe(float64(duration.Milliseconds()))
}

0 comments on commit 05b8b44

Please sign in to comment.