-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathelastic-rules.rule
183 lines (169 loc) · 7.69 KB
/
elastic-rules.rule
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#CPU and memory
# Critical alert: es_cpu_percentage has stayed above 95 for 3 minutes.
ALERT ElasticSearchCpuCritical
  IF es_cpu_percentage > 95
  FOR 3m
  LABELS { severity = "critical" }
  ANNOTATIONS {
    summary = "Critical CPU usage on {{$labels.instance}}",
    description = "{{$labels.instance}} reports critical cpu usage. Please verify workload, or add another node to the cluster "
  }
# Cluster status
# Critical alert: es_status has been greater than 1 for 1 minute.
# NOTE(review): presumably es_status encodes cluster health as 0/1/2
# (green/yellow/red) - confirm against the exporter.
ALERT ElasticSearchStatusCritical
  IF es_status > 1
  FOR 1m
  LABELS { severity = "critical" }
  ANNOTATIONS {
    summary = "Critical cluster status of {{$labels.cluster}} on {{$labels.instance}}",
    description = "{{$labels.instance}} reports critical status of a ElasticSearch cluster {{$labels.cluster}}. Please check additional metrics or logs."
  }
# Warning alert: es_status has been exactly 1 for 30 minutes.
ALERT ElasticSearchStatusWarning
  IF es_status == 1
  FOR 30m
  LABELS { severity = "warning" }
  ANNOTATIONS {
    summary = "NonHealthy cluster status of {{$labels.cluster}} on {{$labels.instance}}",
    description = "{{$labels.instance}} reports non-healthy status of ElasticSearch cluster {{$labels.cluster}}. Please check additional metrics or logs to find a root cause"
  }
# Warning alert: es_unassigned_shards has been above 0 for 10 minutes.
# The alert name (including its spelling) is kept unchanged because
# Alertmanager routes and silences may match on it.
ALERT ElasticSearchUnassigedShards
  IF es_unassigned_shards > 0
  FOR 10m
  LABELS { severity = "warning" }
  ANNOTATIONS {
    # The duration in the text matches the FOR clause above.
    description = "There are unassigned shards for more than 10 minutes in {{$labels.cluster}} on node {{$labels.instance}}. Please check cluster performance",
    summary = "Unassigned shards on {{$labels.cluster}}"
  }
# Warning alert: es_shards_active_percentage has been below 100 for 10 minutes.
ALERT ElasticSearchActiveShardsPercentage
  IF es_shards_active_percentage < 100
  FOR 10m
  LABELS { severity = "warning" }
  ANNOTATIONS {
    # {{$value}} is the value of the alert expression, i.e. the ACTIVE
    # percentage, so the text reports it as such; the duration matches FOR.
    description = "Only {{$value}}% of shards are active on {{$labels.cluster}} for more than 10 minutes. Results from inactive shards are unavailable in returned results.",
    summary = "Non-active shards on {{$labels.cluster}}"
  }
# Index parameters
# Warning alert: es_indexing_failed_count changed by a positive amount over
# the last minute. There is no FOR clause, so it fires on the first
# evaluation where the condition holds.
ALERT ElasticSearchTooManyIndexFailures
  IF delta(es_indexing_failed_count[1m]) > 0
  LABELS { severity = "warning" }
  ANNOTATIONS {
    summary = "Indexing failures on {{$labels.instance}}",
    description = "There are documents indexing failures on node {{$labels.instance}}. Please check logs to get more details."
  }
# Warning alert: the index-throttled metric has been above 0 for 10 minutes.
# NOTE(review): the selector was previously spelled "es_indexng_isthrottled"
# (missing "i"), which looks like a typo that would keep this alert from ever
# firing. Confirm the exact metric name against the exporter's /metrics output.
# The alert name itself is kept unchanged because Alertmanager routes and
# silences may match on it.
ALERT ElasticSearchIndexIsThrootled
  IF es_indexing_isthrottled > 0
  FOR 10m
  LABELS { severity = "warning" }
  ANNOTATIONS {
    description = "Index {{$labels.index}} is throttled for more than 10 minutes. Some documents can be missing from returned results.",
    summary = "Index {{$labels.index}} throttled for more than 10 minutes"
  }
# Warning alert: es_index_unassigned_shards has been above 0 for 10 minutes.
ALERT ElasticSearchIndexUnassignedShards
  IF es_index_unassigned_shards > 0
  FOR 10m
  LABELS { severity = "warning" }
  ANNOTATIONS {
    # The duration in the text matches the FOR clause above.
    description = "Unassigned shards on index {{$labels.index}} from {{$labels.cluster}} for more than 10 minutes",
    summary = "Unassigned shards on {{$labels.index}} at {{$labels.cluster}}"
  }
# Warning alert: JVM heap usage percentage has been above 95 for 1 minute.
# NOTE(review): the selector was previously spelled
# "es_jvm_memory_heap_used_percen" (missing trailing "t"), which looks like a
# truncation that would keep this alert from ever firing. Confirm the exact
# metric name against the exporter's /metrics output.
ALERT ElasticSearchJvmMemoryPercent
  IF es_jvm_memory_heap_used_percent > 95
  FOR 1m
  LABELS { severity = "warning" }
  ANNOTATIONS {
    description = "{{$labels.instance}} reports high memory consumption for more than 1 minute. Please check logs for more details",
    summary = "High JVM memory consumption on ES node {{$labels.instance}}"
  }
#Cluster settings
# Warning alert: an es_cluster_settings series labelled
# cluster_routing_allocation_enable="none" has been above 0 for 10 minutes.
ALERT ElasticSearchClusterAllocationDisabled
  IF es_cluster_settings{cluster_routing_allocation_enable="none"} > 0
  FOR 10m
  LABELS { severity = "warning" }
  ANNOTATIONS {
    summary = "Cluster allocation disabled on cluster {{$labels.cluster}}",
    description = "{{$labels.instance}} reports that cluster allocation has been disabled for {{$labels.cluster}}. Some documents can be missing from reported results."
  }
# Warning alert: an es_cluster_settings series labelled
# cluster_routing_rebalance_enable="none" has been above 0 for 10 minutes.
ALERT ElasticSearchClusterRebalanceDisabled
  IF es_cluster_settings{cluster_routing_rebalance_enable="none"} > 0
  FOR 10m
  LABELS { severity = "warning" }
  ANNOTATIONS {
    summary = "Cluster rebalance disabled on cluster {{$labels.cluster}}",
    description = "{{$labels.instance}} reports that cluster rebalance has been disabled on {{$labels.cluster}}. Some documents can be missing from reported results."
  }
#Cluster settings
# Warning alert: an es_cluster_persistent_settings series labelled
# cluster_routing_allocation_enable="none" has been above 0 for 10 minutes.
ALERT ElasticSearchClusterAllocationDisabledPersistently
  IF es_cluster_persistent_settings{cluster_routing_allocation_enable="none"} > 0
  FOR 10m
  LABELS { severity = "warning" }
  ANNOTATIONS {
    summary = "Cluster allocation disabled on cluster {{$labels.cluster}}",
    description = "{{$labels.instance}} reports that cluster allocation has been disabled persistently for {{$labels.cluster}}. Some documents can be missing from reported results and restart will not help."
  }
# Warning alert: an es_cluster_persistent_settings series labelled
# cluster_routing_rebalance_enable="none" has been above 0 for 10 minutes.
ALERT ElasticSearchClusterRebalanceDisabledPersistently
  IF es_cluster_persistent_settings{cluster_routing_rebalance_enable="none"} > 0
  FOR 10m
  LABELS { severity = "warning" }
  ANNOTATIONS {
    summary = "Cluster rebalance disabled on cluster {{$labels.cluster}}",
    description = "{{$labels.instance}} reports that cluster rebalance has been disabled on {{$labels.cluster}}. Some documents can be missing from reported results and restart will not help."
  }
# Critical alert: an es_cluster_settings series labelled
# cluster_blocks_read_only="true" has been above 0 for 1 minute.
ALERT ElasticSearchClusterReadOnly
  IF es_cluster_settings{cluster_blocks_read_only="true"} > 0
  FOR 1m
  LABELS { severity = "critical" }
  ANNOTATIONS {
    summary = "Cluster {{$labels.cluster}} is in RO mode",
    description = "{{$labels.instance}} reports that cluster {{$labels.cluster}} is in read-only mode. New documents will be rejected."
  }
# Critical alert: an es_cluster_persistent_settings series labelled
# cluster_blocks_read_only="true" has been above 0 for 1 minute.
ALERT ElasticSearchClusterReadOnlyPersistently
  IF es_cluster_persistent_settings{cluster_blocks_read_only="true"} > 0
  FOR 1m
  LABELS { severity = "critical" }
  ANNOTATIONS {
    summary = "Cluster {{$labels.cluster}} is read-only",
    description = "{{$labels.instance}} reports that cluster {{$labels.cluster}} is in read-only mode"
  }
# Warning alert: within one cluster, es_prometheus_version is reported with
# more than one distinct pluginVersion label value, continuously for 5 days.
# The inner sum collapses series to one per (pluginVersion, cluster) pair;
# the outer count then counts those pairs per cluster.
ALERT ElasticDifferentVersion
  IF count(sum(es_prometheus_version) by (pluginVersion, cluster)) by (cluster) > 1
  FOR 5d
  LABELS { severity = "warning" }
  ANNOTATIONS {
    summary = "Cluster {{$labels.cluster}} is inconsistent",
    description = "Cluster {{$labels.cluster}} reports inconsistent versions of ES plugin"
  }
# Warning alert: es_ingest_total_failed_count changed by a positive amount
# over the last minute. There is no FOR clause, so it fires on the first
# evaluation where the condition holds.
ALERT ElasticIngestionFailed
  IF delta(es_ingest_total_failed_count[1m]) > 0
  LABELS { severity = "warning" }
  ANNOTATIONS {
    summary = "Node {{$labels.instance}} ingestion failed",
    description = "Node {{$labels.instance}} reports failed ingestions. Some documents were lost"
  }
# Warning alert: a linear extrapolation of the last hour of es_fs_free_bytes
# predicts a value below 0 in 24 hours (24*3600 seconds), and that prediction
# has held for 10 minutes.
ALERT ElasticNoSpaceWithin24h
  IF predict_linear(es_fs_free_bytes[1h], 24*3600) < 0
  FOR 10m
  LABELS { severity = "warning" }
  ANNOTATIONS {
    # The text now says "run out"; it previously read "will run within 24h".
    description = "Elasticsearch reports that space on {{ $labels.node }}({{ $labels.instance }}) will run out within 24h. Please check disk usage on that host",
    summary = "Elasticsearch {{ $labels.node }} at {{ $labels.cluster }} will be out of disk space within 24h"
  }
# Critical alert: free space as a percentage of total
# (es_fs_path_free_bytes * 100 / es_fs_path_total_bytes) has been below 10
# for 10 minutes. {{ $value }} in the description is that percentage.
ALERT ElasticNoAvailableSpace
  IF es_fs_path_free_bytes * 100 / es_fs_path_total_bytes < 10
  FOR 10m
  LABELS { severity = "critical" }
  ANNOTATIONS {
    summary = "No Available space on {{$labels.instance}}",
    description = "Elasticsearch reports that there are only {{ $value }}% left on {{ $labels.path }} at {{$labels.instance}}. Please check it"
  }
# Critical alert: es_breaker_tripped changed by a positive amount over the
# last minute. There is no FOR clause, so it fires on the first evaluation
# where the condition holds.
ALERT ElasticCircuitEnabled
  IF delta(es_breaker_tripped[1m]) > 0
  LABELS { severity = "critical" }
  ANNOTATIONS {
    # Typo fixed in the operator-facing text: "ans" -> "and".
    description = "Elasticsearch circuit breaker {{ $labels.circuit_name }} was enabled within last minute on {{$labels.node}} ({{$labels.instance}}). Looks like high memory pressure on this host and some data were discarded.",
    summary = "Circuit breaker {{ $labels.circuit_name}} enabled on {{$labels.node}}"
  }
# Warning alert: es_threadpool_rejected changed by a positive amount over the
# last minute; {{ $value }} in the description is that change. There is no
# FOR clause, so it fires on the first evaluation where the condition holds.
ALERT ElasticThreadpoolRejected
  IF delta(es_threadpool_rejected[1m]) > 0
  LABELS { severity = "warning" }
  ANNOTATIONS {
    summary = "Threadpool tasks rejected {{ $labels.threadpool}} at {{$labels.node}}",
    description = "Elasticsearch threadpool {{ $labels.threadpool }} rejected {{ $value }} tasks within last minute. Some jobs failed and never will be repeated, it could be a high CPU pressure or I/O errors. Please check node and it's state"
  }