# checks.d/mesos_master.py
"""Mesos Master check

Collects metrics from mesos master node, only the leader is sending metrics.
"""
# stdlib
from hashlib import md5
import time

# project
from checks import AgentCheck

# 3rd party
import requests


class MesosMaster(AgentCheck):
    """Agent check reporting state, stats and role metrics of a Mesos master.

    Every run queries ``<url>/state.json``. Metrics are only submitted when
    the queried master is the elected leader (its ``leader`` equals its
    ``pid``); in that case ``<url>/stats.json`` and ``<url>/roles.json`` are
    queried as well.

    Example configuration (conf.d/mesos_master.yaml.example)::

        init_config:
          default_timeout: 10

        instances:
          - url: "http://localhost:5050"

    Instance options read by ``check``: ``url`` (required), ``tags`` and
    ``timeout`` (falls back to ``init_config.default_timeout``, then 5).
    """

    GAUGE = AgentCheck.gauge
    # Kept for backward compatibility; no metric below is submitted as a rate.
    RATE = AgentCheck.rate
    SERVICE_CHECK_NAME = "mesos_master.can_connect"

    # Each mapping below is: key in the JSON payload ->
    # (metric name, AgentCheck submission function).

    # Read from each framework's "resources" object in state.json.
    FRAMEWORK_METRICS = {
        'cpus': ('mesos.state.framework.cpu', GAUGE),
        'mem': ('mesos.state.framework.mem', GAUGE),
        'disk': ('mesos.state.framework.disk', GAUGE),
    }

    # Read from each role's "resources" object in roles.json.
    ROLE_RESOURCES_METRICS = {
        'cpus': ('mesos.role.cpu', GAUGE),
        'mem': ('mesos.role.mem', GAUGE),
        'disk': ('mesos.role.disk', GAUGE),
    }

    # Read from the top level of state.json.
    STATE_METRICS = {
        'deactivated_slaves': ('mesos.state.deactivated_slaves', GAUGE),
        'failed_tasks': ('mesos.state.failed_tasks', GAUGE),
        'finished_tasks': ('mesos.state.finished_tasks', GAUGE),
        'killed_tasks': ('mesos.state.killed_tasks', GAUGE),
        'lost_tasks': ('mesos.state.lost_tasks', GAUGE),
        'staged_tasks': ('mesos.state.staged_tasks', GAUGE),
        'started_tasks': ('mesos.state.started_tasks', GAUGE),
    }

    # Read from the top level of stats.json.
    STATS_METRICS = {
        'activated_slaves': ('mesos.stats.activated_slaves', GAUGE),
        'active_schedulers': ('mesos.stats.active_schedulers', GAUGE),
        'active_tasks_gauge': ('mesos.stats.active_tasks_gauge', GAUGE),
        'cpus_percent': ('mesos.stats.cpus_percent', GAUGE),
        'cpus_total': ('mesos.stats.cpus_total', GAUGE),
        'cpus_used': ('mesos.stats.cpus_used', GAUGE),
        'deactivated_slaves': ('mesos.stats.deactivated_slaves', GAUGE),
        'disk_percent': ('mesos.stats.disk_percent', GAUGE),
        'disk_total': ('mesos.stats.disk_total', GAUGE),
        'disk_used': ('mesos.stats.disk_used', GAUGE),
        'elected': ('mesos.stats.elected', GAUGE),
        'failed_tasks': ('mesos.stats.failed_tasks', GAUGE),
        'finished_tasks': ('mesos.stats.finished_tasks', GAUGE),
        'invalid_status_updates': ('mesos.stats.invalid_status_updates', GAUGE),
        'killed_tasks': ('mesos.stats.killed_tasks', GAUGE),
        'lost_tasks': ('mesos.stats.lost_tasks', GAUGE),
        'total_schedulers': ('mesos.stats.total_schedulers', GAUGE),
        'uptime': ('mesos.stats.uptime', GAUGE),
        'valid_status_updates': ('mesos.stats.valid_status_updates', GAUGE),
        'mem_percent': ('mesos.stats.mem_percent', GAUGE),
        'mem_total': ('mesos.stats.mem_total', GAUGE),
        'mem_used': ('mesos.stats.mem_used', GAUGE),
        'outstanding_offers': ('mesos.stats.outstanding_offers', GAUGE),
        'staged_tasks': ('mesos.stats.staged_tasks', GAUGE),
        'started_tasks': ('mesos.stats.started_tasks', GAUGE),
        'master/cpus_percent': ('mesos.stats.master.cpus_percent', GAUGE),
        'master/cpus_total': ('mesos.stats.master.cpus_total', GAUGE),
        'master/cpus_used': ('mesos.stats.master.cpus_used', GAUGE),
        'master/disk_percent': ('mesos.stats.master.disk_percent', GAUGE),
        'master/disk_total': ('mesos.stats.master.disk_total', GAUGE),
        'master/disk_used': ('mesos.stats.master.disk_used', GAUGE),
        'master/dropped_messages': ('mesos.stats.master.dropped_messages', GAUGE),
        'master/elected': ('mesos.stats.master.elected', GAUGE),
        'master/event_queue_dispatches': ('mesos.stats.master.event_queue_dispatches', GAUGE),
        'master/event_queue_http_requests': ('mesos.stats.master.event_queue_http_requests', GAUGE),
        'master/event_queue_messages': ('mesos.stats.master.event_queue_messages', GAUGE),
        'master/frameworks_active': ('mesos.stats.master.frameworks_active', GAUGE),
        'master/frameworks_connected': ('mesos.stats.master.frameworks_connected', GAUGE),
        'master/frameworks_disconnected': ('mesos.stats.master.frameworks_disconnected', GAUGE),
        'master/frameworks_inactive': ('mesos.stats.master.frameworks_inactive', GAUGE),
        'master/invalid_framework_to_executor_messages': ('mesos.stats.master.invalid_framework_to_executor_messages', GAUGE),
        'master/invalid_status_update_acknowledgements': ('mesos.stats.master.invalid_status_update_acknowledgements', GAUGE),
        'master/invalid_status_updates': ('mesos.stats.master.invalid_status_updates', GAUGE),
        'master/mem_percent': ('mesos.stats.master.mem_percent', GAUGE),
        'master/mem_total': ('mesos.stats.master.mem_total', GAUGE),
        'master/mem_used': ('mesos.stats.master.mem_used', GAUGE),
        'master/messages_authenticate': ('mesos.stats.master.messages_authenticate', GAUGE),
        'master/messages_deactivate_framework': ('mesos.stats.master.messages_deactivate_framework', GAUGE),
        'master/messages_decline_offers': ('mesos.stats.master.messages_decline_offers', GAUGE),
        'master/messages_exited_executor': ('mesos.stats.master.messages_exited_executor', GAUGE),
        'master/messages_framework_to_executor': ('mesos.stats.master.messages_framework_to_executor', GAUGE),
        'master/messages_kill_task': ('mesos.stats.master.messages_kill_task', GAUGE),
        'master/messages_launch_tasks': ('mesos.stats.master.messages_launch_tasks', GAUGE),
        'master/messages_reconcile_tasks': ('mesos.stats.master.messages_reconcile_tasks', GAUGE),
        'master/messages_register_framework': ('mesos.stats.master.messages_register_framework', GAUGE),
        'master/messages_register_slave': ('mesos.stats.master.messages_register_slave', GAUGE),
        'master/messages_reregister_framework': ('mesos.stats.master.messages_reregister_framework', GAUGE),
        'master/messages_reregister_slave': ('mesos.stats.master.messages_reregister_slave', GAUGE),
        'master/messages_resource_request': ('mesos.stats.master.messages_resource_request', GAUGE),
        'master/messages_revive_offers': ('mesos.stats.master.messages_revive_offers', GAUGE),
        'master/messages_status_update': ('mesos.stats.master.messages_status_update', GAUGE),
        'master/messages_status_update_acknowledgement': ('mesos.stats.master.messages_status_update_acknowledgement', GAUGE),
        'master/messages_unregister_framework': ('mesos.stats.master.messages_unregister_framework', GAUGE),
        'master/messages_unregister_slave': ('mesos.stats.master.messages_unregister_slave', GAUGE),
        'master/outstanding_offers': ('mesos.stats.master.outstanding_offers', GAUGE),
        'master/recovery_slave_removals': ('mesos.stats.master.recovery_slave_removals', GAUGE),
        'master/slave_registrations': ('mesos.stats.master.slave_registrations', GAUGE),
        'master/slave_removals': ('mesos.stats.master.slave_removals', GAUGE),
        'master/slave_reregistrations': ('mesos.stats.master.slave_reregistrations', GAUGE),
        'master/slave_shutdowns_canceled': ('mesos.stats.master.slave_shutdowns_canceled', GAUGE),
        'master/slave_shutdowns_scheduled': ('mesos.stats.master.slave_shutdowns_scheduled', GAUGE),
        'master/slaves_active': ('mesos.stats.master.slaves_active', GAUGE),
        'master/slaves_connected': ('mesos.stats.master.slaves_connected', GAUGE),
        'master/slaves_disconnected': ('mesos.stats.master.slaves_disconnected', GAUGE),
        'master/slaves_inactive': ('mesos.stats.master.slaves_inactive', GAUGE),
        'master/tasks_error': ('mesos.stats.master.tasks_error', GAUGE),
        'master/tasks_failed': ('mesos.stats.master.tasks_failed', GAUGE),
        'master/tasks_finished': ('mesos.stats.master.tasks_finished', GAUGE),
        'master/tasks_killed': ('mesos.stats.master.tasks_killed', GAUGE),
        'master/tasks_lost': ('mesos.stats.master.tasks_lost', GAUGE),
        'master/tasks_running': ('mesos.stats.master.tasks_running', GAUGE),
        'master/tasks_staging': ('mesos.stats.master.tasks_staging', GAUGE),
        'master/tasks_starting': ('mesos.stats.master.tasks_starting', GAUGE),
        'master/uptime_secs': ('mesos.stats.master.uptime_secs', GAUGE),
        'master/valid_framework_to_executor_messages': ('mesos.stats.master.valid_framework_to_executor_messages', GAUGE),
        'master/valid_status_update_acknowledgements': ('mesos.stats.master.valid_status_update_acknowledgements', GAUGE),
        'master/valid_status_updates': ('mesos.stats.master.valid_status_updates', GAUGE),
        'registrar/queued_operations': ('mesos.stats.registrar.queued_operations', GAUGE),
        'registrar/registry_size_bytes': ('mesos.stats.registrar.registry_size_bytes', GAUGE),
        'registrar/state_fetch_ms': ('mesos.stats.registrar.state_fetch_ms', GAUGE),
        'registrar/state_store_ms': ('mesos.stats.registrar.state_store_ms', GAUGE),
        'registrar/state_store_ms/count': ('mesos.stats.registrar.state_store_ms.count', GAUGE),
        'registrar/state_store_ms/max': ('mesos.stats.registrar.state_store_ms.max', GAUGE),
        'registrar/state_store_ms/min': ('mesos.stats.registrar.state_store_ms.min', GAUGE),
        'registrar/state_store_ms/p50': ('mesos.stats.registrar.state_store_ms.p50', GAUGE),
        'registrar/state_store_ms/p90': ('mesos.stats.registrar.state_store_ms.p90', GAUGE),
        'registrar/state_store_ms/p95': ('mesos.stats.registrar.state_store_ms.p95', GAUGE),
        'registrar/state_store_ms/p99': ('mesos.stats.registrar.state_store_ms.p99', GAUGE),
        'registrar/state_store_ms/p999': ('mesos.stats.registrar.state_store_ms.p999', GAUGE),
        'registrar/state_store_ms/p9999': ('mesos.stats.registrar.state_store_ms.p9999', GAUGE),
        'system/cpus_total': ('mesos.stats.system.cpus_total', GAUGE),
        # Load averages are instantaneous values; submitting them as a rate
        # would report their per-second variation instead of the load itself.
        'system/load_15min': ('mesos.stats.system.load_15min', GAUGE),
        'system/load_1min': ('mesos.stats.system.load_1min', GAUGE),
        'system/load_5min': ('mesos.stats.system.load_5min', GAUGE),
        'system/mem_free_bytes': ('mesos.stats.system.mem_free_bytes', GAUGE),
        'system/mem_total_bytes': ('mesos.stats.system.mem_total_bytes', GAUGE),
    }

    def _timeout_event(self, url, timeout, aggregation_key):
        """Emit an event reporting that ``url`` timed out after ``timeout`` seconds."""
        self.event({
            'timestamp': int(time.time()),
            'event_type': 'http_check',
            'msg_title': 'URL timeout',
            'msg_text': '%s timed out after %s seconds.' % (url, timeout),
            'aggregation_key': aggregation_key
        })

    def _status_code_event(self, url, r, aggregation_key):
        """Emit an event reporting the unexpected status code of response ``r``."""
        self.event({
            'timestamp': int(time.time()),
            'event_type': 'http_check',
            'msg_title': 'Invalid response code for %s' % url,
            'msg_text': '%s returned a status of %s' % (url, r.status_code),
            'aggregation_key': aggregation_key
        })

    def _get_json(self, url, timeout):
        """Fetch ``url`` and return its decoded JSON body, or None on failure.

        A service check named ``SERVICE_CHECK_NAME`` tagged with the URL is
        always sent: OK for a 200 response with a parseable JSON body,
        CRITICAL otherwise (non-200 status, timeout, connection error or
        invalid JSON). Failures are also reported through ``self.warning``;
        timeouts and bad status codes additionally produce an event.
        """
        # Use a hash of the URL as an aggregation key. md5 needs bytes, so
        # text URLs are encoded first (a Python 2 ``str`` is already bytes).
        url_bytes = url if isinstance(url, bytes) else url.encode('utf-8')
        aggregation_key = md5(url_bytes).hexdigest()
        tags = ["url:%s" % url]
        payload = None
        try:
            r = requests.get(url, timeout=timeout)
            if r.status_code != 200:
                self._status_code_event(url, r, aggregation_key)
                status = AgentCheck.CRITICAL
                msg = "Got %s when hitting %s" % (r.status_code, url)
            else:
                # Decode before reporting OK so that a 200 response carrying
                # an invalid body is reported as a failure too.
                payload = r.json()
                status = AgentCheck.OK
                msg = "Mesos master instance detected at %s " % url
        except requests.exceptions.Timeout:
            self._timeout_event(url, timeout, aggregation_key)
            msg = "%s seconds timeout when hitting %s" % (timeout, url)
            status = AgentCheck.CRITICAL
        except Exception as e:
            # str(e) rather than e.message: the latter is deprecated and is
            # not always a string (requests wraps the underlying exception).
            msg = str(e)
            status = AgentCheck.CRITICAL

        self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags,
                           message=msg)
        if status == AgentCheck.CRITICAL:
            self.warning(msg)
            return None

        return payload

    def _get_master_state(self, url, timeout):
        """Return the decoded ``<url>/state.json`` payload, or None on failure."""
        return self._get_json(url + '/state.json', timeout)

    def _get_master_stats(self, url, timeout):
        """Return the decoded ``<url>/stats.json`` payload, or None on failure."""
        return self._get_json(url + '/stats.json', timeout)

    def _get_master_roles(self, url, timeout):
        """Return the decoded ``<url>/roles.json`` payload, or None on failure."""
        return self._get_json(url + '/roles.json', timeout)

    def _check_leadership(self, url, timeout):
        """Return the master state if this master is the leader, else None.

        The master is considered the leader when the ``leader`` field of its
        state equals its own ``pid``. A state without a ``leader`` field is
        treated as "not the leader" instead of raising.
        """
        state = self._get_master_state(url, timeout)
        if state is None:
            return None

        leader = state.get('leader')
        if leader is not None and leader == state.get('pid'):
            return state
        return None

    def _submit_metrics(self, metrics, values, tags):
        """Submit every metric of ``metrics`` whose key is present in ``values``.

        ``metrics`` maps a payload key to ``(metric name, submit function)``.
        Keys missing from ``values`` are skipped so that one absent counter
        does not prevent the remaining metrics from being reported.
        """
        for key, (metric_name, submit) in metrics.items():
            if key in values:
                submit(self, metric_name, values[key], tags=tags)

    def check(self, instance):
        """Collect state, framework, stats and role metrics from the leader.

        Raises:
            Exception: when the instance has no ``url``.
        """
        if 'url' not in instance:
            raise Exception('Mesos instance missing "url" value.')

        url = instance['url']
        # Work on a copy: the tag lists built below must never alter the
        # instance configuration, which is reused on every run.
        instance_tags = list(instance.get('tags') or [])
        default_timeout = self.init_config.get('default_timeout', 5)
        timeout = float(instance.get('timeout', default_timeout))

        state = self._check_leadership(url, timeout)
        if not state:
            # Not the leader (or unreachable): nothing to report.
            return

        tags = ['mesos_pid:' + state['pid'], 'mesos_id:' + state['id']] + instance_tags
        # Only tag with the cluster name when the state payload carries one.
        if 'cluster' in state:
            tags.insert(0, 'cluster:' + state['cluster'])
        self._submit_metrics(self.STATE_METRICS, state, tags)

        for framework in state.get('frameworks', []):
            framework_tags = ['framework:' + framework['id'],
                              'framework_name:' + framework['name']] + instance_tags
            self._submit_metrics(self.FRAMEWORK_METRICS, framework['resources'],
                                 framework_tags)

        stats = self._get_master_stats(url, timeout)
        if stats is not None:
            self._submit_metrics(self.STATS_METRICS, stats, instance_tags)

        roles = self._get_master_roles(url, timeout)
        if roles is not None:
            for role in roles['roles']:
                # Fresh list per role: each role is tagged with its own name
                # only, and the instance tags are left untouched.
                role_tags = instance_tags + ['mesos_role:' + role['name']]
                self.gauge('mesos.role.frameworks', len(role['frameworks']), tags=role_tags)
                self.gauge('mesos.role.weight', role['weight'], tags=role_tags)
                self._submit_metrics(self.ROLE_RESOURCES_METRICS, role['resources'],
                                     role_tags)
# checks.d/mesos_slave.py
"""Mesos Slave check

Collects metrics from mesos slave node.
"""
# stdlib
from hashlib import md5
import time

# project
from checks import AgentCheck

# 3rd party
import requests


class MesosSlave(AgentCheck):
    """Agent check reporting state, task and stats metrics of a Mesos slave.

    Every run queries ``<url>/state.json`` and ``<url>/stats.json``. The
    cluster name is looked up once from the state of the master the slave
    reports in ``master_hostname`` and is then cached in ``cluster_name``.

    Example configuration (conf.d/mesos_slave.yaml.example)::

        init_config:
          default_timeout: 10

        instances:
          - url: "http://localhost:5051"
            # tasks:
            #   - "hello"

    Instance options read by ``check``: ``url`` (required), ``tags``,
    ``tasks`` (name fragments of the tasks to report), ``timeout`` (falls
    back to ``init_config.default_timeout``, then 5) and ``master_port``
    (port used to reach the master, 5050 when omitted).
    """

    GAUGE = AgentCheck.gauge
    # Kept for backward compatibility; no metric below is submitted as a rate.
    RATE = AgentCheck.rate
    SERVICE_CHECK_NAME = "mesos_slave.can_connect"

    # Numeric value reported by 'mesos.state.task.status' for each task state.
    TASK_STATUS = {
        'TASK_STARTING': 0,
        'TASK_RUNNING': 1,
        'TASK_FINISHED': 2,
        'TASK_FAILED': 3,
        'TASK_KILLED': 4,
        'TASK_LOST': 5,
        'TASK_STAGING': 6,
        'TASK_ERROR': 7,
    }

    # Each mapping below is: key in the JSON payload ->
    # (metric name, AgentCheck submission function).

    # Read from each matching task's "resources" object in state.json.
    TASK_METRICS = {
        'cpus': ('mesos.state.task.cpu', GAUGE),
        'mem': ('mesos.state.task.mem', GAUGE),
        'disk': ('mesos.state.task.disk', GAUGE),
    }

    # Read from the top level of state.json.
    STATE_METRICS = {
        'failed_tasks': ('mesos.state.failed_tasks', GAUGE),
        'finished_tasks': ('mesos.state.finished_tasks', GAUGE),
        'killed_tasks': ('mesos.state.killed_tasks', GAUGE),
        'lost_tasks': ('mesos.state.lost_tasks', GAUGE),
        'staged_tasks': ('mesos.state.staged_tasks', GAUGE),
        'started_tasks': ('mesos.state.started_tasks', GAUGE),
    }

    # Read from the top level of stats.json.
    STATS_METRICS = {
        'failed_tasks': ('mesos.stats.failed_tasks', GAUGE),
        'finished_tasks': ('mesos.stats.finished_tasks', GAUGE),
        'invalid_status_updates': ('mesos.stats.invalid_status_updates', GAUGE),
        'killed_tasks': ('mesos.stats.killed_tasks', GAUGE),
        'launched_tasks_gauge': ('mesos.stats.launched_tasks_gauge', GAUGE),
        'lost_tasks': ('mesos.stats.lost_tasks', GAUGE),
        'queued_tasks_gauge': ('mesos.stats.queued_tasks_gauge', GAUGE),
        'recovery_errors': ('mesos.stats.recovery_errors', GAUGE),
        'registered': ('mesos.stats.registered', GAUGE),
        'staged_tasks': ('mesos.stats.staged_tasks', GAUGE),
        'started_tasks': ('mesos.stats.started_tasks', GAUGE),
        'total_frameworks': ('mesos.stats.total_frameworks', GAUGE),
        'uptime': ('mesos.stats.uptime', GAUGE),
        'valid_status_updates': ('mesos.stats.valid_status_updates', GAUGE),
        'slave/cpus_percent': ('mesos.stats.slave.cpus_percent', GAUGE),
        'slave/cpus_total': ('mesos.stats.slave.cpus_total', GAUGE),
        'slave/cpus_used': ('mesos.stats.slave.cpus_used', GAUGE),
        'slave/disk_percent': ('mesos.stats.slave.disk_percent', GAUGE),
        'slave/disk_total': ('mesos.stats.slave.disk_total', GAUGE),
        'slave/disk_used': ('mesos.stats.slave.disk_used', GAUGE),
        'slave/executors_registering': ('mesos.stats.slave.executors_registering', GAUGE),
        'slave/executors_running': ('mesos.stats.slave.executors_running', GAUGE),
        'slave/executors_terminated': ('mesos.stats.slave.executors_terminated', GAUGE),
        'slave/executors_terminating': ('mesos.stats.slave.executors_terminating', GAUGE),
        'slave/frameworks_active': ('mesos.stats.slave.frameworks_active', GAUGE),
        'slave/invalid_framework_messages': ('mesos.stats.slave.invalid_framework_messages', GAUGE),
        'slave/invalid_status_updates': ('mesos.stats.slave.invalid_status_updates', GAUGE),
        'slave/mem_percent': ('mesos.stats.slave.mem_percent', GAUGE),
        'slave/mem_total': ('mesos.stats.slave.mem_total', GAUGE),
        'slave/mem_used': ('mesos.stats.slave.mem_used', GAUGE),
        'slave/recovery_errors': ('mesos.stats.slave.recovery_errors', GAUGE),
        'slave/registered': ('mesos.stats.slave.registered', GAUGE),
        'slave/tasks_failed': ('mesos.stats.slave.tasks_failed', GAUGE),
        'slave/tasks_finished': ('mesos.stats.slave.tasks_finished', GAUGE),
        'slave/tasks_killed': ('mesos.stats.slave.tasks_killed', GAUGE),
        'slave/tasks_lost': ('mesos.stats.slave.tasks_lost', GAUGE),
        'slave/tasks_running': ('mesos.stats.slave.tasks_running', GAUGE),
        'slave/tasks_staging': ('mesos.stats.slave.tasks_staging', GAUGE),
        'slave/tasks_starting': ('mesos.stats.slave.tasks_starting', GAUGE),
        'slave/uptime_secs': ('mesos.stats.slave.uptime_secs', GAUGE),
        'slave/valid_framework_messages': ('mesos.stats.slave.valid_framework_messages', GAUGE),
        'slave/valid_status_updates': ('mesos.stats.slave.valid_status_updates', GAUGE),
        'system/cpus_total': ('mesos.stats.system.cpus_total', GAUGE),
        # Load averages are instantaneous values; submitting them as a rate
        # would report their per-second variation instead of the load itself.
        'system/load_15min': ('mesos.stats.system.load_15min', GAUGE),
        'system/load_1min': ('mesos.stats.system.load_1min', GAUGE),
        'system/load_5min': ('mesos.stats.system.load_5min', GAUGE),
        'system/mem_free_bytes': ('mesos.stats.system.mem_free_bytes', GAUGE),
        'system/mem_total_bytes': ('mesos.stats.system.mem_total_bytes', GAUGE),
    }

    # Cluster name read from the master state; None until it has been resolved.
    cluster_name = None

    def _timeout_event(self, url, timeout, aggregation_key):
        """Emit an event reporting that ``url`` timed out after ``timeout`` seconds."""
        self.event({
            'timestamp': int(time.time()),
            'event_type': 'http_check',
            'msg_title': 'URL timeout',
            'msg_text': '%s timed out after %s seconds.' % (url, timeout),
            'aggregation_key': aggregation_key
        })

    def _status_code_event(self, url, r, aggregation_key):
        """Emit an event reporting the unexpected status code of response ``r``."""
        self.event({
            'timestamp': int(time.time()),
            'event_type': 'http_check',
            'msg_title': 'Invalid response code for %s' % url,
            'msg_text': '%s returned a status of %s' % (url, r.status_code),
            'aggregation_key': aggregation_key
        })

    def _get_json(self, url, timeout):
        """Fetch ``url`` and return its decoded JSON body, or None on failure.

        A service check named ``SERVICE_CHECK_NAME`` tagged with the URL is
        always sent: OK for a 200 response with a parseable JSON body,
        CRITICAL otherwise (non-200 status, timeout, connection error or
        invalid JSON). Failures are also reported through ``self.warning``;
        timeouts and bad status codes additionally produce an event.
        """
        # Use a hash of the URL as an aggregation key. md5 needs bytes, so
        # text URLs are encoded first (a Python 2 ``str`` is already bytes).
        url_bytes = url if isinstance(url, bytes) else url.encode('utf-8')
        aggregation_key = md5(url_bytes).hexdigest()
        tags = ["url:%s" % url]
        payload = None
        try:
            r = requests.get(url, timeout=timeout)
            if r.status_code != 200:
                self._status_code_event(url, r, aggregation_key)
                status = AgentCheck.CRITICAL
                msg = "Got %s when hitting %s" % (r.status_code, url)
            else:
                # Decode before reporting OK so that a 200 response carrying
                # an invalid body is reported as a failure too.
                payload = r.json()
                status = AgentCheck.OK
                msg = "Mesos slave instance detected at %s " % url
        except requests.exceptions.Timeout:
            self._timeout_event(url, timeout, aggregation_key)
            msg = "%s seconds timeout when hitting %s" % (timeout, url)
            status = AgentCheck.CRITICAL
        except Exception as e:
            # str(e) rather than e.message: the latter is deprecated and is
            # not always a string (requests wraps the underlying exception).
            msg = str(e)
            status = AgentCheck.CRITICAL

        self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, message=msg)
        if status == AgentCheck.CRITICAL:
            self.warning(msg)
            return None

        return payload

    def _get_state(self, url, timeout):
        """Return the decoded ``<url>/state.json`` payload, or None on failure."""
        return self._get_json(url + '/state.json', timeout)

    def _get_stats(self, url, timeout):
        """Return the decoded ``<url>/stats.json`` payload, or None on failure."""
        return self._get_json(url + '/stats.json', timeout)

    def _get_constant_attributes(self, url, timeout, master_port=5050):
        """Resolve and cache the cluster name when it is not known yet.

        When ``cluster_name`` is still None the slave state is fetched, then
        the state of the master at ``http://<master_hostname>:<master_port>``
        is fetched to read its ``cluster`` field.

        Returns:
            The slave state fetched during the lookup, or None when no lookup
            was needed (cluster name already cached) or the slave state could
            not be fetched.
        """
        state = None
        if self.cluster_name is None:
            state = self._get_state(url, timeout)
            if state is not None:
                master_hostname = state.get('master_hostname')
                if master_hostname:
                    master_url = 'http://%s:%s' % (master_hostname, master_port)
                    master_state = self._get_state(master_url, timeout)
                    if master_state is not None:
                        # Stays None when the master reports no cluster name.
                        self.cluster_name = master_state.get('cluster')

        return state

    def _submit_metrics(self, metrics, values, tags):
        """Submit every metric of ``metrics`` whose key is present in ``values``.

        ``metrics`` maps a payload key to ``(metric name, submit function)``.
        Keys missing from ``values`` are skipped so that one absent counter
        does not prevent the remaining metrics from being reported.
        """
        for key, (metric_name, submit) in metrics.items():
            if key in values:
                submit(self, metric_name, values[key], tags=tags)

    def _submit_task_metrics(self, state, task_names, tags):
        """Report status and resources of the tasks matching ``task_names``.

        A task matches when one of ``task_names`` is a case-insensitive
        substring of its name and it runs on this slave.
        """
        for wanted in task_names:
            wanted = wanted.lower()
            for framework in state['frameworks']:
                for executor in framework['executors']:
                    for task in executor['tasks']:
                        if wanted not in task['name'].lower() or task['slave_id'] != state['id']:
                            continue
                        task_tags = ['framework_id:' + task['framework_id'],
                                     'executor_id:' + task['executor_id'],
                                     'task_name:' + task['name']] + tags
                        # A state missing from TASK_STATUS has no numeric
                        # value to report; resources are still submitted.
                        status = self.TASK_STATUS.get(task['state'])
                        if status is not None:
                            self.gauge('mesos.state.task.status', status, tags=task_tags)
                        self._submit_metrics(self.TASK_METRICS, task['resources'], task_tags)

    def check(self, instance):
        """Collect state, task and stats metrics from the slave.

        Raises:
            Exception: when the instance has no ``url``.
        """
        if 'url' not in instance:
            raise Exception('Mesos instance missing "url" value.')

        url = instance['url']
        instance_tags = list(instance.get('tags') or [])
        tasks = instance.get('tasks') or []
        default_timeout = self.init_config.get('default_timeout', 5)
        timeout = float(instance.get('timeout', default_timeout))
        master_port = instance.get('master_port', 5050)

        state = self._get_constant_attributes(url, timeout, master_port)
        # No state with a cached cluster name means no lookup was performed,
        # so the state still has to be fetched. Without a cached name the
        # fetch was just attempted and failed: do not retry it right away.
        if state is None and self.cluster_name is not None:
            state = self._get_state(url, timeout)

        tags = None
        if state:
            tags = ['mesos_id:' + state['id'], 'mesos_pid:' + state['pid']] + instance_tags
            # The cluster name is unknown when the master could not be reached.
            if self.cluster_name is not None:
                tags.insert(0, 'cluster:' + self.cluster_name)

            self._submit_metrics(self.STATE_METRICS, state, tags)
            self._submit_task_metrics(state, tasks, tags)

        stats = self._get_stats(url, timeout)
        if stats:
            self._submit_metrics(self.STATS_METRICS, stats, tags if tags else instance_tags)
# tests/test_mesos_master.py
from tests.common import AgentCheckTest, get_check_class

from nose.plugins.attrib import attr
from mock import patch
from checks import AgentCheck
import time

# Canned /state.json payload of a leading master (leader == pid).
state = {
    "version": "0.22.0",
    "unregistered_frameworks": [],
    "started_tasks": 0,
    "start_time": 1428951954.34111,
    "staged_tasks": 0,
    "slaves": [
        {
            "resources": {
                "ports": "[31000-32000]",
                "mem": 244,
                "disk": 35164,
                "cpus": 1
            },
            "reregistered_time": 1428951983.53731,
            "registered_time": 1428951983.53725,
            "pid": "slave(1)@127.0.0.1:5051",
            "id": "20150410-134224-16777343-5050-1778-S0",
            "hostname": "localhost",
            "attributes": {},
            "active": 'true'
        }
    ],
    "pid": "master@127.0.0.1:5050",
    "orphan_tasks": [],
    "lost_tasks": 0,
    "log_dir": "/var/log/mesos",
    "leader": "master@127.0.0.1:5050",
    "killed_tasks": 0,
    "elected_time": 1428951954.3774,
    "deactivated_slaves": 0,
    "completed_frameworks": [],
    "cluster": "datadog-test",
    "build_user": "root",
    "build_time": 1427376927,
    "build_date": "2015-03-26 13:35:27",
    "activated_slaves": 1,
    "failed_tasks": 0,
    "finished_tasks": 0,
    "flags": {
        "zk_session_timeout": "10secs",
        "zk": "zk://localhost:2181/mesos",
        "work_dir": "/var/lib/mesos",
        "webui_dir": "/usr/share/mesos/webui",
        "version": "false",
        "user_sorter": "drf",
        "slave_reregister_timeout": "10mins",
        "root_submissions": "true",
        "registry_strict": "false",
        "registry_store_timeout": "5secs",
        "registry_fetch_timeout": "1mins",
        "registry": "replicated_log",
        "initialize_driver_logging": "true",
        "help": "false",
        "framework_sorter": "drf",
        "cluster": "datadog-test",
        "authenticators": "crammd5",
        "authenticate_slaves": "false",
        "authenticate": "false",
        "allocation_interval": "1secs",
        "log_auto_initialize": "true",
        "log_dir": "/var/log/mesos",
        "logbufsecs": "0",
        "logging_level": "INFO",
        "port": "5050",
        "quiet": "false",
        "quorum": "1",
        "recovery_slave_removal_limit": "100%"
    },
    "frameworks": [
        {
            "webui_url": "http://192.168.33.20:8080",
            "user": "root",
            "offered_resources": {
                "mem": 0,
                "disk": 0,
                "cpus": 0
            },
            "name": "marathon",
            "id": "20150403-140128-251789322-5050-6047-0000",
            "hostname": "vagrant-ubuntu-trusty-64",
            "failover_timeout": 604800,
            "completed_tasks": [],
            "checkpoint": 'true',
            "active": 'true',
            "offers": [],
            "registered_time": 1428951955.38871,
            "reregistered_time": 1428951955.38872,
            "resources": {
                "ports": "[31915-31915]",
                "mem": 100,
                "disk": 0,
                "cpus": 1
            },
            "role": "*",
            "tasks": [
                {
                    "statuses": [
                        {
                            "timestamp": 1428673971.61592,
                            "state": "TASK_RUNNING"
                        }
                    ],
                    "executor_id": "",
                    "framework_id": "20150403-140128-251789322-5050-6047-0000",
                    "id": "hello.dc130e23-df88-11e4-b9ec-080027fc1312",
                    "labels": [],
                    "name": "hello",
                    "resources": {
                        "ports": "[31915-31915]",
                        "mem": 100,
                        "disk": 0,
                        "cpus": 1
                    },
                    "slave_id": "20150410-134224-16777343-5050-1778-S0",
                    "state": "TASK_RUNNING"
                }
            ],
            "unregistered_time": 0,
            "used_resources": {
                "ports": "[31915-31915]",
                "mem": 100,
                "disk": 0,
                "cpus": 1
            }
        }
    ],
    "git_sha": "e890e2414903bb69cab730d5204f10b10d2e91bb",
    "git_tag": "0.22.0",
    "hostname": "localhost",
    "id": "20150413-190554-16777343-5050-16324"
}

# Canned /stats.json payload.
stats = {
    "valid_status_updates": 0,
    "uptime": 706.524240128,
    "total_schedulers": 1,
    "system/mem_total_bytes": 513798144,
    "system/mem_free_bytes": 13815808,
    "system/load_5min": 0.02,
    "system/load_1min": 0,
    "system/load_15min": 0.07,
    "system/cpus_total": 1,
    "started_tasks": 0,
    "staged_tasks": 0,
    "registrar/state_store_ms/p9999": 9.90120192,
    "registrar/state_store_ms/p999": 9.8956032,
    "registrar/state_store_ms/p99": 9.839616,
    "registrar/state_store_ms/p95": 9.590784,
    "registrar/state_store_ms/p90": 9.279744,
    "registrar/state_store_ms/p50": 6.791424,
    "registrar/state_store_ms/min": 3.681024,
    "registrar/state_store_ms/max": 9.901824,
    "registrar/state_store_ms/count": 2,
    "registrar/state_store_ms": 9.901824,
    "registrar/state_fetch_ms": 3.717888,
    "registrar/registry_size_bytes": 246,
    "registrar/queued_operations": 0,
    "outstanding_offers": 0,
    "mem_used": 100,
    "mem_total": 244,
    "mem_percent": 0.409836065573771,
    "master/valid_status_updates": 0,
    "master/valid_status_update_acknowledgements": 0,
    "master/valid_framework_to_executor_messages": 0,
    "master/uptime_secs": 706.52485632,
    "master/tasks_starting": 0,
    "master/tasks_staging": 0,
    "master/tasks_running": 1,
    "master/tasks_lost": 0,
    "master/tasks_killed": 0,
    "master/tasks_finished": 0,
    "master/tasks_failed": 0,
    "master/tasks_error": 0,
    "master/slaves_inactive": 0,
    "master/slaves_disconnected": 0,
    "master/invalid_framework_to_executor_messages": 0,
    "master/frameworks_inactive": 0,
    "master/frameworks_disconnected": 0,
    "master/frameworks_connected": 1,
    "master/frameworks_active": 1,
    "master/event_queue_messages": 0,
    "master/event_queue_http_requests": 0,
    "master/event_queue_dispatches": 17,
    "master/elected": 1,
    "master/dropped_messages": 1,
    "master/disk_used": 0,
    "master/disk_total": 35164,
    "master/disk_percent": 0,
    "master/cpus_used": 1,
    "master/cpus_total": 1,
    "master/cpus_percent": 1,
    "disk_percent": 0,
    "deactivated_slaves": 0,
    "cpus_used": 1,
    "cpus_total": 1,
    "cpus_percent": 1,
    "active_tasks_gauge": 1,
    "active_schedulers": 1,
    "activated_slaves": 1,
    "disk_total": 35164,
    "disk_used": 0,
    "elected": 1,
    "failed_tasks": 0,
    "finished_tasks": 0,
    "invalid_status_updates": 0,
    "killed_tasks": 0,
    "lost_tasks": 0,
    "master/invalid_status_update_acknowledgements": 0,
    "master/invalid_status_updates": 0,
    "master/mem_percent": 0.409836065573771,
    "master/mem_total": 244,
    "master/mem_used": 100,
    "master/messages_authenticate": 0,
    "master/messages_deactivate_framework": 0,
    "master/messages_decline_offers": 123,
    "master/messages_exited_executor": 0,
    "master/messages_framework_to_executor": 0,
    "master/messages_kill_task": 0,
    "master/messages_launch_tasks": 0,
    "master/messages_reconcile_tasks": 6,
    "master/messages_register_framework": 0,
    "master/messages_register_slave": 0,
    "master/messages_reregister_framework": 1,
    "master/messages_reregister_slave": 2,
    "master/messages_resource_request": 0,
    "master/messages_revive_offers": 0,
    "master/messages_status_update": 0,
    "master/messages_status_update_acknowledgement": 0,
    "master/messages_unregister_framework": 0,
    "master/messages_unregister_slave": 0,
    "master/outstanding_offers": 0,
    "master/recovery_slave_removals": 0,
    "master/slave_registrations": 0,
    "master/slave_removals": 0,
    "master/slave_reregistrations": 1,
    "master/slave_shutdowns_canceled": 0,
    "master/slave_shutdowns_scheduled": 0,
    "master/slaves_active": 1,
    "master/slaves_connected": 1
}

# Canned /roles.json payload.
roles = {
    "roles": [
        {
            "weight": 1,
            "resources": {
                "ports": "[31915-31915]",
                "mem": 100,
                "disk": 0,
                "cpus": 1
            },
            "name": "*",
            "frameworks": [
                "20150403-140128-251789322-5050-6047-0000"
            ]
        }
    ]
}


def _mocked_get_master_state(*args, **kwargs):
    """Stand-in for MesosMaster._get_master_state returning the canned state."""
    return state


def _mocked_get_master_stats(*args, **kwargs):
    """Stand-in for MesosMaster._get_master_stats returning the canned stats."""
    return stats


def _mocked_get_master_roles(*args, **kwargs):
    """Stand-in for MesosMaster._get_master_roles returning the canned roles."""
    return roles


@attr(requires='mesos_master')
class TestMesosMaster(AgentCheckTest):
    """Runs the mesos_master check against canned payloads."""
    CHECK_NAME = 'mesos_master'

    def test_checks(self):
        """Every metric declared by the check is reported after two runs."""
        instance = {'url': 'http://localhost:5050'}
        config = {'init_config': {}, 'instances': [instance]}

        klass = get_check_class('mesos_master')
        state_patch = patch.object(klass, '_get_master_state', _mocked_get_master_state)
        stats_patch = patch.object(klass, '_get_master_stats', _mocked_get_master_stats)
        roles_patch = patch.object(klass, '_get_master_roles', _mocked_get_master_roles)
        with state_patch:
            with stats_patch:
                with roles_patch:
                    check = klass('mesos_master', {}, {})
                    # Two runs one second apart.
                    self.run_check(config)
                    time.sleep(1)
                    self.run_check(config)

                    for _key, definition in check.STATE_METRICS.iteritems():
                        self.assertMetric(definition[0])
                    for _key, definition in check.FRAMEWORK_METRICS.iteritems():
                        self.assertMetric(definition[0])
                    for _key, definition in check.STATS_METRICS.iteritems():
                        self.assertMetric(definition[0])
                    for _key, definition in check.ROLE_RESOURCES_METRICS.iteritems():
                        self.assertMetric(definition[0])
                    self.assertMetric('mesos.role.frameworks')
                    self.assertMetric('mesos.role.weight')
"docker": "docker", + "disk_watch_interval": "1mins", + "authenticatee": "crammd5", + "cgroups_enable_cfs": "false", + "cgroups_hierarchy": "/sys/fs/cgroup", + "cgroups_limit_swap": "false", + "cgroups_root": "mesos", + "container_disk_watch_interval": "15secs", + "containerizers": "mesos", + "default_role": "*", + "frameworks_home": "", + "gc_delay": "1weeks", + "gc_disk_headroom": "0.1", + "hadoop_home": "", + "help": "false", + "initialize_driver_logging": "true", + "isolation": "posix/cpu,posix/mem", + "launcher_dir": "/usr/libexec/mesos", + "log_dir": "/var/log/mesos", + "logbufsecs": "0", + "logging_level": "INFO", + "master": "zk://localhost:2181/mesos", + "perf_duration": "10secs", + "perf_interval": "1mins", + "port": "5051", + "quiet": "false" + }, + "finished_tasks": 0, + "failed_tasks": 0, + "completed_frameworks": [], + "build_user": "root", + "build_time": 1427376927, + "build_date": "2015-03-26 13:35:27", + "attributes": {}, + "frameworks": [ + { + "user": "root", + "checkpoint": 'true', + "completed_executors": [], + "executors": [ + { + "tasks": [ + { + "statuses": [ + { + "timestamp": 1428673971.61592, + "state": "TASK_RUNNING" + } + ], + "executor_id": "", + "framework_id": "20150403-140128-251789322-5050-6047-0000", + "id": "hello.dc130e23-df88-11e4-b9ec-080027fc1312", + "labels": [], + "name": "hello", + "resources": { + "ports": "[31915-31915]", + "mem": 100, + "disk": 0, + "cpus": 1 + }, + "slave_id": "20150410-134224-16777343-5050-1778-S0", + "state": "TASK_RUNNING" + } + ], + "completed_tasks": [], + "container": "f67a5e0b-91f9-474a-94a0-e2c6a3b28ea4", + "directory": "/tmp/mesos/slaves/20150410-134224-16777343-5050-1778-S0/frameworks/20150403-140128-251789322-5050-6047-0000/executors/hello.dc130e23-df88-11e4-b9ec-080027fc1312/runs/f67a5e0b-91f9-474a-94a0-e2c6a3b28ea4", + "id": "hello.dc130e23-df88-11e4-b9ec-080027fc1312", + "name": "Command Executor (Task: hello.dc130e23-df88-11e4-b9ec-080027fc1312) (Command: sh -c 'cd hello && ...')", + 
"queued_tasks": [], + "resources": { + "ports": "[31915-31915]", + "mem": 132, + "disk": 0, + "cpus": 1.1 + }, + "source": "hello.dc130e23-df88-11e4-b9ec-080027fc1312" + } + ], + "failover_timeout": 604800, + "hostname": "vagrant-ubuntu-trusty-64", + "id": "20150403-140128-251789322-5050-6047-0000", + "name": "marathon", + "role": "*" + } + ], + "git_sha": "e890e2414903bb69cab730d5204f10b10d2e91bb", + "git_tag": "0.22.0", + "hostname": "localhost", + "id": "20150410-134224-16777343-5050-1778-S0", + "killed_tasks": 0, + "log_dir": "/var/log/mesos", + "lost_tasks": 0 +} + +stats = { + "valid_status_updates": 1, + "uptime": 280965.77977984, + "total_frameworks": 1, + "system/mem_total_bytes": 513798144, + "system/mem_free_bytes": 34271232, + "system/load_5min": 0.08, + "system/load_1min": 0.1, + "system/load_15min": 0.06, + "system/cpus_total": 1, + "started_tasks": 0, + "staged_tasks": 1, + "slave/valid_status_updates": 1, + "slave/valid_framework_messages": 0, + "slave/uptime_secs": 280965.78028288, + "slave/tasks_starting": 0, + "slave/tasks_staging": 0, + "slave/executors_registering": 0, + "slave/disk_used": 0, + "slave/disk_total": 35164, + "slave/disk_percent": 0, + "slave/cpus_used": 1.1, + "slave/cpus_total": 1, + "slave/cpus_percent": 1.1, + "registered": 1, + "failed_tasks": 0, + "finished_tasks": 0, + "invalid_status_updates": 0, + "killed_tasks": 0, + "launched_tasks_gauge": 1, + "lost_tasks": 0, + "queued_tasks_gauge": 0, + "recovery_errors": 0, + "slave/executors_running": 1, + "slave/executors_terminated": 0, + "slave/executors_terminating": 0, + "slave/frameworks_active": 1, + "slave/invalid_framework_messages": 0, + "slave/invalid_status_updates": 0, + "slave/mem_percent": 0.540983606557377, + "slave/mem_total": 244, + "slave/mem_used": 132, + "slave/recovery_errors": 0, + "slave/registered": 1, + "slave/tasks_failed": 0, + "slave/tasks_finished": 0, + "slave/tasks_killed": 0, + "slave/tasks_lost": 0, + "slave/tasks_running": 1 +} + +def 
def _mocked_get_state(*args, **kwargs):
    """Stand-in for the check's ``_get_state``: return the canned ``state`` fixture.

    Accepts and ignores any arguments so it can be patched over the real
    method regardless of how that method is called.
    """
    return state


def _mocked_get_stats(*args, **kwargs):
    """Stand-in for the check's ``_get_stats``: return the canned ``stats`` fixture.

    Accepts and ignores any arguments so it can be patched over the real
    method regardless of how that method is called.
    """
    return stats


@attr(requires='mesos_slave')
class TestMesosSlave(AgentCheckTest):
    """Run the ``mesos_slave`` check against canned payloads and assert its metrics."""

    CHECK_NAME = 'mesos_slave'

    def test_checks(self):
        """Run the check twice with patched fetchers, then assert every declared metric."""
        # NOTE(review): the `state`/`stats` fixtures describe a slave on port
        # 5051; the URL below says 5050. Both fetchers are patched in this
        # test, so presumably the value is never used for a real request --
        # confirm before changing it.
        config = {
            'init_config': {},
            'instances': [
                {
                    'url': 'http://localhost:5050',
                    'tasks': ['hello']
                }
            ]
        }

        klass = get_check_class('mesos_slave')
        # Nested (rather than comma-joined) `with` statements are kept as in
        # the original, which also stays valid on Python 2.6.
        with patch.object(klass, '_get_state', _mocked_get_state):
            with patch.object(klass, '_get_stats', _mocked_get_stats):
                # Here this instance is only used to read the metric tables
                # asserted below.
                check = klass('mesos_slave', {}, {})

                self.run_check(config)
                # Presumably spaces the two runs apart so rate-type metrics
                # get two samples with distinct timestamps -- TODO confirm.
                time.sleep(1)
                self.run_check(config)

                # Each table entry is a tuple whose first element is the
                # metric name; assert each one. A plain loop replaces the
                # original side-effect-only list comprehensions.
                metric_tables = (
                    check.STATE_METRICS,
                    check.TASK_METRICS,
                    check.STATS_METRICS,
                )
                for table in metric_tables:
                    for metric in table.itervalues():
                        self.assertMetric(metric[0])

                self.assertMetric('mesos.state.task.status')