diff --git a/files/build_templates/sonic_debian_extension.j2 b/files/build_templates/sonic_debian_extension.j2 index 0f371f98bd0a..b7e69c134df8 100644 --- a/files/build_templates/sonic_debian_extension.j2 +++ b/files/build_templates/sonic_debian_extension.j2 @@ -159,6 +159,12 @@ sudo cp {{daemon_base_py2_wheel_path}} $FILESYSTEM_ROOT/$DAEMON_BASE_PY2_WHEEL_N sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install $DAEMON_BASE_PY2_WHEEL_NAME sudo rm -rf $FILESYSTEM_ROOT/$DAEMON_BASE_PY2_WHEEL_NAME +# Install system-health Python 2 package +SYSTEM_HEALTH_PY2_WHEEL_NAME=$(basename {{system_health_py2_wheel_path}}) +sudo cp {{system_health_py2_wheel_path}} $FILESYSTEM_ROOT/$SYSTEM_HEALTH_PY2_WHEEL_NAME +sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install $SYSTEM_HEALTH_PY2_WHEEL_NAME +sudo rm -rf $FILESYSTEM_ROOT/$SYSTEM_HEALTH_PY2_WHEEL_NAME + # Install built Python Click package (and its dependencies via 'apt-get -y install -f') # Do this before installing sonic-utilities so that it doesn't attempt to install # an older version as part of its dependencies @@ -243,6 +249,10 @@ sudo mkdir -p $FILESYSTEM_ROOT/etc/systemd/system/syslog.socket.d sudo cp $IMAGE_CONFIGS/syslog/override.conf $FILESYSTEM_ROOT/etc/systemd/system/syslog.socket.d/override.conf sudo cp $IMAGE_CONFIGS/syslog/host_umount.sh $FILESYSTEM_ROOT/usr/bin/ +# Copy system-health files +sudo LANG=C cp $IMAGE_CONFIGS/system-health/system-health.service $FILESYSTEM_ROOT_USR_LIB_SYSTEMD_SYSTEM +echo "system-health.service" | sudo tee -a $GENERATED_SERVICE_FILE + # Copy logrotate.d configuration files sudo cp -f $IMAGE_CONFIGS/logrotate/logrotate.d/* $FILESYSTEM_ROOT/etc/logrotate.d/ diff --git a/files/image_config/system-health/system-health.service b/files/image_config/system-health/system-health.service new file mode 100644 index 000000000000..c472aa962c99 --- /dev/null +++ b/files/image_config/system-health/system-health.service @@ -0,0 +1,11 @@ +[Unit] 
+Description=Monitor system health +Requires=database.service updategraph.service +After=database.service updategraph.service + +[Service] +ExecStart=/usr/local/bin/healthd +Restart=always + +[Install] +WantedBy=multi-user.target diff --git a/rules/system-health.mk b/rules/system-health.mk new file mode 100644 index 000000000000..59292d76bec4 --- /dev/null +++ b/rules/system-health.mk @@ -0,0 +1,9 @@ +# system health python2 wheel + +SYSTEM_HEALTH = system_health-1.0-py2-none-any.whl +$(SYSTEM_HEALTH)_SRC_PATH = $(SRC_PATH)/system-health +$(SYSTEM_HEALTH)_PYTHON_VERSION = 2 +$(SYSTEM_HEALTH)_DEPENDS = $(SONIC_DAEMON_BASE_PY2) $(SWSSSDK_PY2) $(SONIC_CONFIG_ENGINE) +SONIC_PYTHON_WHEELS += $(SYSTEM_HEALTH) + +export system_health_py2_wheel_path="$(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH))" diff --git a/slave.mk b/slave.mk index 06736eafe42e..2d414845c57a 100644 --- a/slave.mk +++ b/slave.mk @@ -786,7 +786,8 @@ $(addprefix $(TARGET_PATH)/, $(SONIC_INSTALLERS)) : $(TARGET_PATH)/% : \ $(addprefix $(PYTHON_WHEELS_PATH)/,$(REDIS_DUMP_LOAD_PY2)) \ $(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_PLATFORM_API_PY2)) \ $(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_YANG_MODELS_PY3)) \ - $(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_YANG_MGMT_PY)) + $(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_YANG_MGMT_PY)) \ + $(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH)) $(HEADER) # Pass initramfs and linux kernel explicitly. 
class Config(object):
    """Load and expose the per-platform system-health monitoring settings.

    Settings are read from ``system_health_monitoring_config.json`` inside
    the platform's device directory. Every accessor falls back to a built-in
    default when the file is missing, unreadable or lacks the value.
    """

    # Polling interval (seconds) used when the file does not provide one.
    DEFAULT_INTERVAL = 60
    # Boot-up timeout (seconds) used when the file does not provide a valid one.
    DEFAULT_BOOTUP_TIMEOUT = 300
    # LED color per health status used when the file does not override it.
    DEFAULT_LED_CONFIG = {
        'fault': 'red',
        'normal': 'green',
        'booting': 'orange_blink'
    }
    GET_PLATFORM_CMD = 'sonic-cfggen -d -v DEVICE_METADATA.localhost.platform'
    CONFIG_FILE = 'system_health_monitoring_config.json'

    def __init__(self):
        """Resolve the platform name and start with default settings."""
        mi = get_machine_info()
        if mi is not None:
            self.platform_name = get_platform_info(mi)
        else:
            # No machine info available: ask sonic-cfggen instead.
            self.platform_name = self._get_platform_name()
        self._config_file = os.path.join('/usr/share/sonic/device/', self.platform_name, Config.CONFIG_FILE)
        self._last_mtime = None
        self.config_data = None
        self.interval = Config.DEFAULT_INTERVAL
        self.ignore_services = None
        self.ignore_devices = None
        self.external_checkers = None

    def load_config(self):
        """(Re)load the configuration file if it changed since the last load.

        A missing file resets a previously loaded configuration to the
        defaults. Any error while reading or parsing also resets to the
        defaults, so a broken file never stops the caller.
        """
        if not os.path.exists(self._config_file):
            if self._last_mtime is not None:
                self._reset()
            return

        try:
            # Track the modification time only. Comparing whole os.stat()
            # results would also react to access-time and metadata changes.
            mtime = os.path.getmtime(self._config_file)
        except OSError:
            # The file vanished between the existence check and the stat.
            self._reset()
            return

        if mtime == self._last_mtime:
            return

        try:
            self._last_mtime = mtime
            with open(self._config_file, 'r') as f:
                self.config_data = json.load(f)

            self.interval = self.config_data.get('polling_interval', Config.DEFAULT_INTERVAL)
            self.ignore_services = self._get_list_data('services_to_ignore')
            self.ignore_devices = self._get_list_data('devices_to_ignore')
            self.external_checkers = self._get_list_data('external_checkers')
        except Exception:
            # Best effort: an unreadable or malformed file means "use defaults".
            # _reset() clears _last_mtime, so the next call tries again.
            self._reset()

    def _reset(self):
        """Drop any loaded configuration and restore the default settings."""
        self._last_mtime = None
        self.config_data = None
        self.interval = Config.DEFAULT_INTERVAL
        self.ignore_services = None
        self.ignore_devices = None
        self.external_checkers = None

    def get_led_color(self, status):
        """Return the LED color configured for ``status``.

        The ``led_color`` section of the file wins; otherwise the value comes
        from DEFAULT_LED_CONFIG.

        :raises KeyError: if ``status`` is in neither the file nor the defaults.
        """
        if self.config_data and 'led_color' in self.config_data:
            if status in self.config_data['led_color']:
                return self.config_data['led_color'][status]

        return self.DEFAULT_LED_CONFIG[status]

    def get_bootup_timeout(self):
        """Return the configured boot timeout in seconds.

        Falls back to DEFAULT_BOOTUP_TIMEOUT when the value is absent, not
        convertible to an integer, or not positive.
        """
        if self.config_data and 'boot_timeout' in self.config_data:
            try:
                timeout = int(self.config_data['boot_timeout'])
                if timeout <= 0:
                    timeout = self.DEFAULT_BOOTUP_TIMEOUT
                return timeout
            except (TypeError, ValueError):
                # TypeError covers JSON null / list / object values.
                pass
        return self.DEFAULT_BOOTUP_TIMEOUT

    def _get_platform_name(self):
        """Return the platform name printed by GET_PLATFORM_CMD, stripped."""
        from .utils import run_command
        output = run_command(Config.GET_PLATFORM_CMD)
        return output.strip()

    def _get_list_data(self, key):
        """Return the list stored under ``key`` as a set.

        Returns None when the key is absent or its value is not a list.
        """
        if key in self.config_data:
            data = self.config_data[key]
            if isinstance(data, list):
                return set(data)
        return None
class ExternalChecker(HealthChecker):
    """Health checker that runs a configured command and parses its output.

    Expected output format, as parsed by check():

    * the first non-blank line is the category name reported by
      get_category();
    * every following line of the form ``<object name>:<message>`` describes
      one object, which is healthy only when the message is exactly ``OK``;
    * lines without a ':' are ignored.
    """

    def __init__(self, cmd):
        """
        :param cmd: shell command line whose output is parsed by check().
        """
        HealthChecker.__init__(self)
        self._cmd = cmd
        self._category = None

    def reset(self):
        """Forget previous results and fall back to the 'External' category."""
        self._category = 'External'
        self._info = {}

    def get_category(self):
        """Return the category announced by the command, or 'External'."""
        return self._category

    def check(self, config):
        """Run the command and record one entry per reported object.

        When the command yields no usable output (run_command returned
        nothing, or only blank lines), a single "Not OK" entry keyed by
        str(self) is recorded instead.

        :param config: unused; accepted for interface compatibility.
        """
        self.reset()

        output = utils.run_command(self._cmd)
        # Keep only non-blank lines. An empty result covers both "no output
        # at all" and "whitespace only", which share the same failure message.
        lines = []
        if output:
            lines = [line.strip() for line in output.splitlines() if line.strip()]

        if not lines:
            self.set_object_not_ok('External', str(self), 'Failed to get output of command \"{}\"'.format(self._cmd))
            return

        self._category = lines[0]
        for line in lines[1:]:
            obj_name, separator, msg = line.partition(':')
            if not separator:
                # Not "<name>:<message>"; ignore the line.
                continue
            obj_name = obj_name.strip()
            msg = msg.strip()
            if msg != 'OK':
                self.set_object_not_ok('External', obj_name, msg)
            else:
                self.set_object_ok('External', obj_name)

    def __str__(self):
        return 'ExternalChecker - {}'.format(self._cmd)
def _check_asic_status(self, config):
    """Record the ASIC as OK or Not OK based on its temperature.

    Reads the 'temperature' and 'high_threshold' fields stored under
    ASIC_TEMPERATURE_KEY in the state DB. The ASIC is Not OK when either
    field is missing or empty, when either is not a number, or when the
    temperature is above the threshold. Nothing is recorded when 'asic'
    is listed in config.ignore_devices.
    """
    ignored_devices = config.ignore_devices
    if ignored_devices and 'asic' in ignored_devices:
        return

    asic_key = HardwareChecker.ASIC_TEMPERATURE_KEY
    temperature = self._db.get(self._db.STATE_DB, asic_key, 'temperature')
    temperature_threshold = self._db.get(self._db.STATE_DB, asic_key, 'high_threshold')

    if not temperature:
        self.set_object_not_ok('ASIC', 'ASIC', 'Failed to get ASIC temperature')
        return

    if not temperature_threshold:
        self.set_object_not_ok('ASIC', 'ASIC', 'Failed to get ASIC temperature threshold')
        return

    try:
        # Both names are rebound in place so that the error message below
        # shows whatever had (or had not) been converted when parsing failed.
        temperature = float(temperature)
        temperature_threshold = float(temperature_threshold)
    except ValueError:
        error_message = 'Invalid ASIC temperature data, temperature={}, threshold={}'.format(
            temperature, temperature_threshold)
        self.set_object_not_ok('ASIC', 'ASIC', error_message)
        return

    if temperature > temperature_threshold:
        too_hot_message = 'ASIC temperature is too hot, temperature={}, threshold={}'.format(
            temperature, temperature_threshold)
        self.set_object_not_ok('ASIC', 'ASIC', too_hot_message)
        return

    self.set_object_ok('ASIC', 'ASIC')
data_dict = self._db.get_all(self._db.STATE_DB, key) + presence = data_dict.get('presence', 'false') + if presence.lower() != 'true': + self.set_object_not_ok('Fan', name, '{} is missing'.format(name)) + continue + + status = data_dict.get('status', 'false') + if status.lower() != 'true': + self.set_object_not_ok('Fan', name, '{} is broken'.format(name)) + continue + + if not self._ignore_check(config.ignore_devices, 'fan', name, 'speed'): + speed = data_dict.get('speed', None) + speed_target = data_dict.get('speed_target', None) + speed_tolerance = data_dict.get('speed_tolerance', None) + if not speed: + self.set_object_not_ok('Fan', name, 'Failed to get actual speed data for {}'.format(name)) + continue + elif not speed_target: + self.set_object_not_ok('Fan', name, 'Failed to get target speed date for {}'.format(name)) + continue + elif not speed_tolerance: + self.set_object_not_ok('Fan', name, 'Failed to get speed tolerance for {}'.format(name)) + continue + else: + try: + speed = float(speed) + speed_target = float(speed_target) + speed_tolerance = float(speed_tolerance) + speed_min_th = speed_target * (1 - float(speed_tolerance) / 100) + speed_max_th = speed_target * (1 + float(speed_tolerance) / 100) + if speed < speed_min_th or speed > speed_max_th: + self.set_object_not_ok('Fan', name, + '{} speed is out of range, speed={}, range=[{},{}]'.format(name, + speed, + speed_min_th, + speed_max_th)) + continue + except ValueError: + self.set_object_not_ok('Fan', name, + 'Invalid fan speed data for {}, speed={}, target={}, tolerance={}'.format( + name, + speed, + speed_target, + speed_tolerance)) + continue + + self.set_object_ok('Fan', name) + + def _check_psu_status(self, config): + if config.ignore_devices and 'psu' in config.ignore_devices: + return + + keys = self._db.keys(self._db.STATE_DB, HardwareChecker.PSU_TABLE_NAME + '*') + if not keys: + self.set_object_not_ok('PSU', 'PSU', 'Failed to get PSU information') + return + + for key in natsorted(keys): + 
key_list = key.split('|') + if len(key_list) != 2: # error data in DB, log it and ignore + self.set_object_not_ok('PSU', key, 'Invalid key for PSU_INFO: {}'.format(key)) + continue + + name = key_list[1] + if config.ignore_devices and name in config.ignore_devices: + continue + + data_dict = self._db.get_all(self._db.STATE_DB, key) + presence = data_dict.get('presence', 'false') + if presence.lower() != 'true': + self.set_object_not_ok('PSU', name, '{} is missing or not available'.format(name)) + continue + + status = data_dict.get('status', 'false') + if status.lower() != 'true': + self.set_object_not_ok('PSU', name, '{} is out of power'.format(name)) + continue + + if not self._ignore_check(config.ignore_devices, 'psu', name, 'temperature'): + temperature = data_dict.get('temp', None) + temperature_threshold = data_dict.get('temp_th', None) + if temperature is None: + self.set_object_not_ok('PSU', name, 'Failed to get temperature data for {}'.format(name)) + continue + elif temperature_threshold is None: + self.set_object_not_ok('PSU', name, 'Failed to get temperature threshold data for {}'.format(name)) + continue + else: + try: + temperature = float(temperature) + temperature_threshold = float(temperature_threshold) + if temperature > temperature_threshold: + self.set_object_not_ok('PSU', name, + '{} temperature is too hot, temperature={}, threshold={}'.format( + name, temperature, + temperature_threshold)) + continue + except ValueError: + self.set_object_not_ok('PSU', name, + 'Invalid temperature data for {}, temperature={}, threshold={}'.format( + name, temperature, + temperature_threshold)) + continue + + if not self._ignore_check(config.ignore_devices, 'psu', name, 'voltage'): + voltage = data_dict.get('voltage', None) + voltage_min_th = data_dict.get('voltage_min_th', None) + voltage_max_th = data_dict.get('voltage_max_th', None) + if voltage is None: + self.set_object_not_ok('PSU', name, 'Failed to get voltage data for {}'.format(name)) + continue + elif 
class HealthChecker(object):
    """Base class for health checkers.

    Results are kept in ``self._info``, a dict mapping an object name to a
    dict holding that object's type, status and message. Subclasses override
    reset(), get_category() and check().
    """

    # Keys of the per-object result dict.
    INFO_FIELD_OBJECT_TYPE = 'type'
    INFO_FIELD_OBJECT_STATUS = 'status'
    INFO_FIELD_OBJECT_MSG = 'message'

    # Values stored under INFO_FIELD_OBJECT_STATUS.
    STATUS_OK = 'OK'
    STATUS_NOT_OK = 'Not OK'

    def __init__(self):
        self._info = {}

    def reset(self):
        """Hook for subclasses; the base implementation does nothing."""
        return None

    def get_category(self):
        """Hook for subclasses; the base implementation returns None."""
        return None

    def get_info(self):
        """Return the collected results, keyed by object name."""
        return self._info

    def check(self, config):
        """Hook for subclasses; the base implementation does nothing."""
        return None

    def __str__(self):
        return type(self).__name__

    def add_info(self, object_name, key, value):
        """Store ``value`` under ``key`` in the entry for ``object_name``.

        The entry is created on first use.
        """
        self._info.setdefault(object_name, {})[key] = value

    def set_object_not_ok(self, object_type, object_name, message):
        """Record ``object_name`` as unhealthy together with ``message``."""
        fields = (
            (self.INFO_FIELD_OBJECT_TYPE, object_type),
            (self.INFO_FIELD_OBJECT_MSG, message),
            (self.INFO_FIELD_OBJECT_STATUS, self.STATUS_NOT_OK),
        )
        for field_name, field_value in fields:
            self.add_info(object_name, field_name, field_value)

    def set_object_ok(self, object_type, object_name):
        """Record ``object_name`` as healthy with an empty message."""
        fields = (
            (self.INFO_FIELD_OBJECT_TYPE, object_type),
            (self.INFO_FIELD_OBJECT_MSG, ''),
            (self.INFO_FIELD_OBJECT_STATUS, self.STATUS_OK),
        )
        for field_name, field_value in fields:
            self.add_info(object_name, field_name, field_value)
def _is_system_booting(self):
    """Return True while the uptime is below the configured boot timeout.

    The first time the uptime reaches the timeout, the manager state is
    switched to STATE_RUNNING and False is returned.
    """
    from .utils import get_uptime
    seconds_since_boot = get_uptime()
    boot_timeout = self.config.get_bootup_timeout()
    if seconds_since_boot < boot_timeout:
        return True

    self._state = self.STATE_RUNNING
    return False
class ServiceChecker(HealthChecker):
    """Health checker that reports the state of every entry monit supervises.

    The status table printed by ``monit summary -B`` is parsed by column
    position: the offsets of the 'Status' and 'Type' headers on the second
    output line delimit the name, status and type columns.
    """

    CHECK_MONIT_SERVICE_CMD = 'systemctl is-active monit.service'
    CHECK_CMD = 'monit summary -B'
    # Banner line + header line + at least one entry.
    MIN_CHECK_CMD_LINES = 3
    # Status text that means "healthy" for each monit entry type.
    EXPECT_STATUS_DICT = {
        'System': 'Running',
        'Process': 'Running',
        'Filesystem': 'Accessible'
    }

    def __init__(self):
        HealthChecker.__init__(self)

    def reset(self):
        """Forget the results of the previous check."""
        self._info = {}

    def get_category(self):
        return 'Services'

    def check(self, config):
        """Record one OK / Not OK entry per monit-supervised object.

        A single 'monit' entry is recorded as Not OK when the monit service
        is not active or its summary cannot be parsed. Entries whose name is
        in config.ignore_services are skipped, and so are entries (and blank
        lines) whose type has no expected status in EXPECT_STATUS_DICT.

        :param config: object exposing ``ignore_services`` (a set or None).
        """
        self.reset()
        # run_command returns None when the command could not be executed.
        output = utils.run_command(ServiceChecker.CHECK_MONIT_SERVICE_CMD)
        if not output or output.strip() != 'active':
            self.set_object_not_ok('Service', 'monit', 'monit service is not running')
            return

        output = utils.run_command(ServiceChecker.CHECK_CMD)
        lines = output.splitlines() if output else []
        if len(lines) < ServiceChecker.MIN_CHECK_CMD_LINES:
            self.set_object_not_ok('Service', 'monit', 'output of \"monit summary -B\" is invalid or incompatible')
            return

        status_begin = lines[1].find('Status')
        type_begin = lines[1].find('Type')
        if status_begin < 0 or type_begin < 0:
            self.set_object_not_ok('Service', 'monit', 'output of \"monit summary -B\" is invalid or incompatible')
            return

        for line in lines[2:]:
            name = line[0:status_begin].strip()
            if config.ignore_services and name in config.ignore_services:
                continue
            status = line[status_begin:type_begin].strip()
            service_type = line[type_begin:].strip()
            expect_status = ServiceChecker.EXPECT_STATUS_DICT.get(service_type)
            if expect_status is None:
                # Unknown monit entry type or a blank line: there is no
                # expectation to compare against, so do not fail the whole
                # service check with a KeyError.
                continue
            if expect_status != status:
                self.set_object_not_ok(service_type, name, '{} is not {}'.format(name, expect_status))
            else:
                self.set_object_ok(service_type, name)
"devices_to_ignore": [], + "external_checkers": [], + "polling_interval": 60, + "led_color": { + "fault": "amber", + "normal": "green", + "booting": "orange_blink" + }, + "boot_timeout": 300 +} \ No newline at end of file diff --git a/src/system-health/health_checker/utils.py b/src/system-health/health_checker/utils.py new file mode 100644 index 000000000000..e29afd502017 --- /dev/null +++ b/src/system-health/health_checker/utils.py @@ -0,0 +1,16 @@ +import subprocess + + +def run_command(command): + try: + process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) + return process.communicate()[0].encode('utf-8') + except: + return None + + +def get_uptime(): + with open('/proc/uptime', 'r') as f: + uptime_seconds = float(f.readline().split()[0]) + + return uptime_seconds diff --git a/src/system-health/pytest.ini b/src/system-health/pytest.ini new file mode 100644 index 000000000000..c24fe5bb9e65 --- /dev/null +++ b/src/system-health/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +filterwarnings = + ignore::DeprecationWarning diff --git a/src/system-health/scripts/healthd b/src/system-health/scripts/healthd new file mode 100644 index 000000000000..1773e589d43f --- /dev/null +++ b/src/system-health/scripts/healthd @@ -0,0 +1,108 @@ +#!/usr/bin/env python2 + +""" + healthd + System health monitor daemon for SONiC +""" + +import signal +import threading +from sonic_daemon_base.daemon_base import Logger +from sonic_daemon_base.daemon_base import DaemonBase +from swsssdk import SonicV2Connector +from health_checker.manager import HealthCheckerManager + +SYSLOG_IDENTIFIER = 'healthd' +logger = Logger(SYSLOG_IDENTIFIER) + + +class HealthDaemon(DaemonBase): + SYSTEM_HEALTH_TABLE_NAME = 'SYSTEM_HEALTH_INFO' + + def __init__(self): + """ + Constructor of HealthDaemon + """ + DaemonBase.__init__(self) + self._db = SonicV2Connector(host="127.0.0.1") + self._db.connect(self._db.STATE_DB) + self.stop_event = threading.Event() + + def deinit(self): + 
# Signal handler
def signal_handler(self, sig, frame):
    """
    Log the received signal and request shutdown on SIGINT or SIGTERM.

    SIGHUP is logged and ignored. SIGINT and SIGTERM set ``self.stop_event``
    so that the loop in run() can exit. Any other signal is only logged.

    :param sig: Signal number
    :param frame: not used
    :return:
    """
    if sig == signal.SIGHUP:
        logger.log_info("Caught SIGHUP - ignoring...")
    elif sig == signal.SIGINT:
        logger.log_info("Caught SIGINT - exiting...")
        self.stop_event.set()
    elif sig == signal.SIGTERM:
        logger.log_info("Caught SIGTERM - exiting...")
        self.stop_event.set()
    else:
        # sig is an integer; format it instead of concatenating it to a str,
        # which would raise TypeError inside the handler.
        logger.log_warning("Caught unhandled signal '{}'".format(sig))
def main():
    """Create the health daemon and run it."""
    health_daemon = HealthDaemon()
    health_daemon.run()
class MockConnector(object):
    """In-memory stand-in for the DB connector, used by the unit tests.

    All instances share the class-level ``data`` dict, which maps a DB key
    to a dict of field/value pairs. The ``db_id`` arguments are accepted for
    signature compatibility and otherwise ignored.
    """

    STATE_DB = None
    # Shared across instances: {key: {field: value}}
    data = {}

    def __init__(self, host):
        pass

    def connect(self, db_id):
        pass

    def get(self, db_id, key, field):
        """Return the value of ``field`` under ``key``, or None if absent.

        Returning None for a missing key or field follows Redis HGET
        semantics. NOTE(review): confirm the real connector does the same.
        """
        return MockConnector.data.get(key, {}).get(field)

    def keys(self, db_id, pattern):
        """Return the stored keys that match the glob ``pattern``.

        The whole key must match the pattern, so 'FAN_INFO*' selects keys
        that start with 'FAN_INFO' rather than keys that merely contain it.
        """
        import fnmatch
        return [key for key in MockConnector.data if fnmatch.fnmatchcase(key, pattern)]

    def get_all(self, db_id, key):
        """Return all fields stored under ``key``, or an empty dict."""
        return MockConnector.data.get(key, {})
+ utils.run_command = MagicMock(return_value='\n\n\n') + checker.check(None) + assert checker._info[str(checker)][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + + valid_output = 'MyCategory\nDevice1:OK\nDevice2:Device2 is broken\n' + utils.run_command = MagicMock(return_value=valid_output) + checker.check(None) + assert 'Device1' in checker._info + assert 'Device2' in checker._info + assert checker._info['Device1'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK + assert checker._info['Device2'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + + +def test_service_checker(): + from health_checker import utils + from health_checker.config import Config + from health_checker.health_checker import HealthChecker + from health_checker.service_checker import ServiceChecker + return_value = '' + def mock_run_command(cmd): + if cmd == ServiceChecker.CHECK_MONIT_SERVICE_CMD: + return 'active' + else: + return return_value + + utils.run_command = mock_run_command + return_value = 'Monit 5.20.0 uptime: 3h 54m\n' \ + 'Service Name Status Type\n' \ + 'sonic Running System\n' \ + 'sonic1 Not running System\n' \ + 'telemetry Does not exist Process\n' \ + 'orchagent Running Process\n' \ + 'root-overlay Accessible Filesystem\n' \ + 'var-log Is not accessible Filesystem\n' \ + + checker = ServiceChecker() + config = Config() + checker.check(config) + assert 'sonic' in checker._info + assert checker._info['sonic'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK + + assert 'sonic1' in checker._info + assert checker._info['sonic1'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + + assert 'orchagent' in checker._info + assert checker._info['orchagent'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK + + assert 'telemetry' in checker._info + assert checker._info['telemetry'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + + assert 
'root-overlay' in checker._info + assert checker._info['root-overlay'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK + + assert 'var-log' in checker._info + assert checker._info['var-log'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + + +def test_hardware_checker(): + from health_checker import utils + from health_checker.config import Config + from health_checker.health_checker import HealthChecker + from health_checker.hardware_checker import HardwareChecker + + MockConnector.data.update({ + 'TEMPERATURE_INFO|ASIC': { + 'temperature': '20', + 'high_threshold': '21' + } + }) + + MockConnector.data.update({ + 'FAN_INFO|fan1': { + 'presence': 'True', + 'status': 'True', + 'speed': '60', + 'speed_target': '60', + 'speed_tolerance': '20' + }, + 'FAN_INFO|fan2': { + 'presence': 'False', + 'status': 'True', + 'speed': '60', + 'speed_target': '60', + 'speed_tolerance': '20' + }, + 'FAN_INFO|fan3': { + 'presence': 'True', + 'status': 'False', + 'speed': '60', + 'speed_target': '60', + 'speed_tolerance': '20' + }, + 'FAN_INFO|fan4': { + 'presence': 'True', + 'status': 'True', + 'speed': '20', + 'speed_target': '60', + 'speed_tolerance': '20' + } + }) + + MockConnector.data.update({ + 'PSU_INFO|PSU 1': { + 'presence': 'True', + 'status': 'True', + 'temp': '55', + 'temp_th': '100', + 'voltage': '10', + 'voltage_min_th': '8', + 'voltage_max_th': '15', + }, + 'PSU_INFO|PSU 2': { + 'presence': 'False', + 'status': 'True', + 'temp': '55', + 'temp_th': '100', + 'voltage': '10', + 'voltage_min_th': '8', + 'voltage_max_th': '15', + }, + 'PSU_INFO|PSU 3': { + 'presence': 'True', + 'status': 'False', + 'temp': '55', + 'temp_th': '100', + 'voltage': '10', + 'voltage_min_th': '8', + 'voltage_max_th': '15', + }, + 'PSU_INFO|PSU 4': { + 'presence': 'True', + 'status': 'True', + 'temp': '101', + 'temp_th': '100', + 'voltage': '10', + 'voltage_min_th': '8', + 'voltage_max_th': '15', + }, + 'PSU_INFO|PSU 5': { + 'presence': 'True', + 'status': 
'True', + 'temp': '55', + 'temp_th': '100', + 'voltage': '10', + 'voltage_min_th': '12', + 'voltage_max_th': '15', + } + }) + + checker = HardwareChecker() + config = Config() + checker.check(config) + + assert 'ASIC' in checker._info + assert checker._info['ASIC'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK + + assert 'fan1' in checker._info + assert checker._info['fan1'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK + + assert 'fan2' in checker._info + assert checker._info['fan2'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + + assert 'fan3' in checker._info + assert checker._info['fan3'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + + assert 'fan4' in checker._info + assert checker._info['fan4'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + + assert 'PSU 1' in checker._info + assert checker._info['PSU 1'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK + + assert 'PSU 2' in checker._info + assert checker._info['PSU 2'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + + assert 'PSU 3' in checker._info + assert checker._info['PSU 3'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + + assert 'PSU 4' in checker._info + assert checker._info['PSU 4'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + + assert 'PSU 5' in checker._info + assert checker._info['PSU 5'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK