Skip to content
This repository has been archived by the owner on Jun 24, 2021. It is now read-only.

Commit

Permalink
add tiflash alert rules & refactor tiflash grafana dashboard (#1158) (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
marsishandsome authored Feb 24, 2020
1 parent 3466fb5 commit 1dd10d8
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 117 deletions.
86 changes: 86 additions & 0 deletions roles/prometheus/files/tiflash.rules.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# Prometheus alerting rules for TiFlash.
#
# Every rule follows the same layout:
#   expr        - PromQL condition that must hold for the `for` duration.
#   labels      - env, severity level, and a copy of the rule expression
#                 (labels.expr repeats the rule-level expr verbatim; keep the
#                 two in sync when a threshold changes).
#   annotations - description / value / summary text attached to the alert.
#
# NOTE(review): ENV_LABELS_ENV looks like a placeholder that is substituted
# with the cluster name at deploy time - confirm against the deployment tasks.
groups:
- name: alert.rules
  rules:
  # jemalloc-allocated memory above 1.6e+10 bytes for one minute.
  - alert: TiFlash_memory_abnormal
    expr: tiflash_system_asynchronous_metric_jemalloc_allocated > 1.6e+10
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: tiflash_system_asynchronous_metric_jemalloc_allocated > 1.6e+10
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiFlash memory usage is over 16 GB

  # Per-instance 99th percentile of the TMT merge duration above 600.
  - alert: TiFlash_tmt_merge_duration
    expr: histogram_quantile(0.99, sum(rate(tiflash_tmt_merge_duration_seconds_bucket[1m])) BY (le, instance)) > 600
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: emergency
      expr: histogram_quantile(0.99, sum(rate(tiflash_tmt_merge_duration_seconds_bucket[1m])) BY (le, instance)) > 600
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiFlash tmt merge duration 99th percentile is above 600s

  # Per-instance 99th percentile of the TMT write-parts duration above 8.
  - alert: TiFlash_tmt_write_parts_duration
    expr: histogram_quantile(0.99, sum(rate(tiflash_tmt_write_parts_duration_seconds_bucket[1m])) BY (le, instance)) > 8
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: emergency
      expr: histogram_quantile(0.99, sum(rate(tiflash_tmt_write_parts_duration_seconds_bucket[1m])) BY (le, instance)) > 8
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiFlash tmt write parts duration 99th percentile is above 8s

  # Any increase of the type="failed" schema-apply counter over 15 minutes.
  - alert: TiFlash_schema_error
    expr: increase(tiflash_schema_apply_count{type="failed"}[15m]) > 0
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: emergency
      expr: increase(tiflash_schema_apply_count{type="failed"}[15m]) > 0
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiFlash schema error

  # Per-instance 99th percentile of the schema-apply duration above 20.
  - alert: TiFlash_schema_apply_duration
    expr: histogram_quantile(0.99, sum(rate(tiflash_schema_apply_duration_seconds_bucket[1m])) BY (le, instance)) > 20
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: emergency
      expr: histogram_quantile(0.99, sum(rate(tiflash_schema_apply_duration_seconds_bucket[1m])) BY (le, instance)) > 20
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiFlash schema apply duration 99th percentile is above 20s

  # Per-instance 99th percentile of the raft read-index duration above 3.
  - alert: TiFlash_raft_read_index_duration
    expr: histogram_quantile(0.99, sum(rate(tiflash_raft_read_index_duration_seconds_bucket[1m])) BY (le, instance)) > 3
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: critical
      expr: histogram_quantile(0.99, sum(rate(tiflash_raft_read_index_duration_seconds_bucket[1m])) BY (le, instance)) > 3
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiFlash raft read index duration 99th percentile is above 3s

  # Per-instance 99th percentile of the raft wait-index duration above 2.
  - alert: TiFlash_raft_wait_index_duration
    expr: histogram_quantile(0.99, sum(rate(tiflash_raft_wait_index_duration_seconds_bucket[1m])) BY (le, instance)) > 2
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: critical
      expr: histogram_quantile(0.99, sum(rate(tiflash_raft_wait_index_duration_seconds_bucket[1m])) BY (le, instance)) > 2
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiFlash raft wait index duration 99th percentile is above 2s
2 changes: 2 additions & 0 deletions roles/prometheus/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
- blacker.rules.yml
- kafka.rules.yml
- lightning.rules.yml
- tiflash.rules.yml
register: alert_rules_st

- name: backup alert rules file
Expand All @@ -52,6 +53,7 @@
- binlog.rules.yml
- blacker.rules.yml
- kafka.rules.yml
- tiflash.rules.yml

- include_tasks: "{{ deployment_method }}_deployment.yml"

Expand Down
1 change: 1 addition & 0 deletions roles/prometheus/templates/prometheus.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ rule_files:
- 'tidb.rules.yml'
- 'tikv.rules.yml'
- 'tikv.accelerate.rules.yml'
- 'tiflash.rules.yml'
{% if enable_binlog|default(false) %}
- 'binlog.rules.yml'
{% endif %}
Expand Down
117 changes: 2 additions & 115 deletions scripts/tiflash_proxy_summary.json
Original file line number Diff line number Diff line change
Expand Up @@ -815,21 +815,12 @@
"refId": "B",
"step": 4
},
{
"expr": "sum(rate(tiflash_proxy_tikv_coprocessor_request_error{instance=~\"$instance\", type='full'}[1m])) by (instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "coprocessor-{{instance}}",
"metric": "",
"refId": "C",
"step": 4
},
{
"expr": "avg(tiflash_proxy_tikv_engine_write_stall{instance=~\"$instance\", type=\"write_stall_percentile99\"}) by (instance)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "stall-{{instance}}",
"refId": "D"
"refId": "C"
}
],
"thresholds": [],
Expand Down Expand Up @@ -977,110 +968,6 @@
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_TEST-CLUSTER}",
"decimals": 1,
"description": "The count of dropped leader in each TiKV instance",
"editable": true,
"error": false,
"fill": 0,
"grid": {},
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 9
},
"id": 1722,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": true,
"hideZero": true,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"sideWidth": 250,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "total",
"lines": false
}
],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(delta(tiflash_proxy_tikv_raftstore_region_count{instance=~\"$instance\", type=\"leader\"}[1m])) by (instance)",
"format": "time_series",
"hide": false,
"intervalFactor": 2,
"legendFormat": "{{instance}}",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Leader drop",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"repeat": null,
Expand Down Expand Up @@ -2125,4 +2012,4 @@
"title": "TiFlash-Proxy-Summary",
"uid": "myoLjZQWz",
"version": 18
}
}
4 changes: 2 additions & 2 deletions scripts/tiflash_summary.json
Original file line number Diff line number Diff line change
Expand Up @@ -1861,7 +1861,7 @@
}
],
"repeat": null,
"title": "TMT",
"title": "Storage",
"type": "row"
},
{
Expand Down Expand Up @@ -2247,4 +2247,4 @@
"title": "Test-Cluster-TiFlash-Summary",
"uid": "SVbh2xUWk",
"version": 2
}
}

0 comments on commit 1dd10d8

Please sign in to comment.