forked from anchor/nagios-plugin-elasticsearch-jvm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: check_elasticsearch_jvm
executable file
·149 lines (117 loc) · 5.97 KB
/
check_elasticsearch_jvm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#! /usr/bin/env python
from nagioscheck import NagiosCheck, UsageError
from nagioscheck import PerformanceMetric, Status
import urllib2
try:
import json
except ImportError:
import simplejson as json
class ElasticSearchJvmCheck(NagiosCheck):
    """Nagios plugin that checks JVM heap usage on an ElasticSearch node.

    Queries the node-local ElasticSearch HTTP API, compares heap usage
    (used bytes / max bytes, as a percentage) against the -w/-c
    thresholds, and emits JVM memory and GC counters as Nagios
    performance data.  Check outcomes are signalled by raising Status
    (the nagioscheck control-flow convention).
    """

    # Plugin version reported by the nagioscheck framework.
    version = '0.2.0'

    def __init__(self):
        NagiosCheck.__init__(self)
        # add_option(short, long, attr, help): values surface as
        # opts.<attr> in check() below.
        self.add_option('H', 'host', 'host', "Hostname or network "
                        "address to probe. The ElasticSearch API "
                        "should be listening here. Defaults to "
                        "'localhost'.")
        self.add_option('p', 'port', 'port', "TCP port to probe. "
                        "The ElasticSearch API should be listening "
                        "here. Defaults to 9200.")
        self.add_option('w', 'warn', 'warn', "Warning threshold percent")
        self.add_option('c', 'crit', 'crit', "Critical threshold percent")

    def check(self, opts, args):
        """Run the check: fetch node stats, emit perfdata, raise Status.

        Raises Status('critical'/'warning'/'ok') with the heap-usage
        summary and accumulated perfdata; get_json may itself raise
        Status('unknown'/'critical') on API failure.
        """
        host = opts.host or "localhost"
        port = int(opts.port or '9200')

        #
        # Data retrieval
        #

        # Request a bunch of useful numbers that we export as perfdata.
        # Details like the number of get, search, and indexing
        # operations come from here.
        es_node = get_json(r'http://%s:%d/_nodes/_local/?all=true' % (host, port))
        es_stats = get_json(r'http://%s:%d/_nodes/_local/'
                            'stats?all=true' % (host, port))
        # The stats document is keyed by an opaque node id; since we
        # query _local there is exactly one entry (Python 2 .keys()
        # returns a list, so [0] is valid here).
        myid = es_stats['nodes'].keys()[0]
        # ?human adds pre-formatted strings (e.g. heap_used) alongside
        # the raw *_in_bytes values; used for the status message below.
        es_node_jvm = get_json(r'http://%s:%d/_nodes/%s/stats/jvm?human' % (host, port, myid))

        #
        # Perfdata
        #

        perfdata = []

        def dict2perfdata(base, metrics):
            # Each entry of `metrics` is [label, dotted.path] or
            # [label, dotted.path, unit].  Walk `base` along the dotted
            # path and append a PerformanceMetric to the enclosing
            # `perfdata` list only when every key exists — pool names
            # vary with the JVM's GC configuration, so missing paths
            # are silently skipped rather than treated as errors.
            for metric in metrics:
                if len(metric) == 2:
                    label, path = metric
                    unit = ""
                elif len(metric) > 2:
                    label, path, unit = metric
                else:
                    continue
                keys = path.split(".")
                value = base
                for key in keys:
                    if value is None:
                        break
                    try:
                        value = value[key]
                    except KeyError:
                        value = None
                        break
                if value is not None:
                    metric = PerformanceMetric(label=label,
                                               value=value,
                                               unit=unit)
                    perfdata.append(metric)

        # Add cluster-wide metrics first. If you monitor all of your ES
        # cluster nodes with this plugin, they should all report the
        # same figures for these labels. Not ideal, but 'tis better to
        # graph this data multiple times than not graph it at all.
        # NOTE(review): pool names below assume the CMS/ParNew collector
        # (CMS Old Gen, Par Eden Space, ...); with other collectors the
        # paths are absent and dict2perfdata skips them.
        metrics = [["heap_committed", 'heap_committed_in_bytes', "B"],
                   ["heap_used", 'heap_used_in_bytes', "B"],
                   ["non_heap_committed", 'non_heap_committed_in_bytes', "B"],
                   ["non_heap_used", 'non_heap_used_in_bytes', "B"],
                   ["old_gen_used", 'pools.CMS Old Gen.used_in_bytes', "B"],
                   ["perm_gen_used", 'pools.CMS Perm Gen.used_in_bytes', "B"],
                   ["eden_used", 'pools.Par Eden Space.used_in_bytes', "B"],
                   ["survivor_used", 'pools.Par Survivor Space.used_in_bytes', "B"],
                   ["code_cache", 'pools.Code Cache.used_in_bytes', "B"]
                   ]
        dict2perfdata(es_stats['nodes'][myid]['jvm']['mem'], metrics)

        # Static JVM limits come from the node-info document, not stats.
        metrics = [["heap_max", 'heap_max_in_bytes', "B"],
                   ["non_heap_max", 'non_heap_max_in_bytes', "B"],
                   ["direct_max", 'direct_max_in_bytes', "B"]]
        dict2perfdata(es_node['nodes'][myid]['jvm']['mem'], metrics)

        # GC collector names are discovered at runtime, so this works
        # regardless of which collectors the JVM is running.  Unit "c"
        # marks these as monotonically increasing Nagios counters.
        collectors = es_node_jvm["nodes"][myid]["jvm"]["gc"]["collectors"].keys()
        for collector in collectors:
            metrics = [["%s_collections" % collector, "collectors.%s.collection_count" % collector, "c"],
                       ["%s_time_ms" % collector, "collectors.%s.collection_time_in_millis" % collector, "c"],
                       ]
            dict2perfdata(es_node_jvm["nodes"][myid]["jvm"]["gc"], metrics)

        # Threshold comparison uses the raw byte counts; the ?human
        # strings (heap_used/heap_max) are only for display.
        heap_used_b = int(es_stats['nodes'][myid]['jvm']['mem']['heap_used_in_bytes'])
        heap_max_b = int(es_node ['nodes'][myid]['jvm']['mem']['heap_max_in_bytes'])
        heap_used = es_node_jvm['nodes'][myid]['jvm']['mem']['heap_used']
        heap_max = es_node_jvm['nodes'][myid]['jvm']['mem']['heap_max']
        heap_usage_percent = (float(heap_used_b)/float(heap_max_b))*100

        # Critical is checked before warning so the more severe state
        # wins when both thresholds are crossed.
        if opts.crit and heap_usage_percent >= float(opts.crit):
            raise Status("critical","JVM Heap Usage: %d%% %s/%s" % (heap_usage_percent,str(heap_used),str(heap_max)),perfdata)
        if opts.warn and heap_usage_percent >= float(opts.warn):
            raise Status("warning","JVM Heap Usage: %d%% %s/%s" % (heap_usage_percent,str(heap_used),str(heap_max)),perfdata)
        raise Status("ok","JVM Heap Usage: %d%% %s/%s" % (heap_usage_percent,str(heap_used),str(heap_max)),perfdata)
def get_json(uri):
    """Fetch *uri* over HTTP and return the decoded JSON document.

    Raises Status('unknown') on an HTTP error response or on a body
    that fails to parse as JSON, and Status('critical') when the server
    is unreachable, following the nagioscheck control-flow convention.
    """
    try:
        f = urllib2.urlopen(uri)
    except urllib2.HTTPError as e:
        raise Status('unknown', ("API failure retrieving %s" % uri,
                                 None,
                                 "API failure:\n\n%s" % str(e)))
    except urllib2.URLError as e:
        # The server could be down; make this CRITICAL.
        raise Status('critical', (e.reason,))
    # Fix: always close the response handle — the original leaked the
    # socket on every call (one poll cycle each).
    try:
        body = f.read()
    finally:
        f.close()
    try:
        j = json.loads(body)
    except ValueError:
        raise Status('unknown', ("API returned nonsense",))
    return j
if __name__ == '__main__':
    # Entry point: instantiate the plugin and hand control to the
    # nagioscheck runner (parses argv, calls check(), exits).
    plugin = ElasticSearchJvmCheck()
    plugin.run()
# vim: ts=8 et