diff --git a/src/system-health/health_checker/service_checker.py b/src/system-health/health_checker/service_checker.py index bc38bd43a929..10ec7e678595 100644 --- a/src/system-health/health_checker/service_checker.py +++ b/src/system-health/health_checker/service_checker.py @@ -15,6 +15,19 @@ EVENTS_PUBLISHER_SOURCE = "sonic-events-host" EVENTS_PUBLISHER_TAG = "process-not-running" +def check_docker_image(image_name): + """ + @summary: This function will check if docker image exists. + @return: True if the image exists, otherwise False. + """ + try: + DOCKER_CLIENT = docker.DockerClient(base_url='unix://var/run/docker.sock') + DOCKER_CLIENT.images.get(image_name) + return True + except (docker.errors.ImageNotFound, docker.errors.APIError) as err: + logger.log_warning("Failed to get image '{}'. Error: '{}'".format(image_name, err)) + return False + class ServiceChecker(HealthChecker): """ Checker that checks critical system service status via monit service. @@ -84,21 +97,39 @@ def get_expected_running_containers(self, feature_table): # it will be removed from exception list. run_all_instance_list = ['database', 'bgp'] - for feature_name, feature_entry in feature_table.items(): + container_list = [] + for container_name in feature_table.keys(): + # slim image does not have telemetry container and corresponding docker image + if container_name == "telemetry": + ret = check_docker_image("docker-sonic-telemetry") + if not ret: + # If telemetry container image is not present, check gnmi container image + # If gnmi container image is not present, ignore telemetry container check + # if gnmi container image is present, check gnmi container instead of telemetry + ret = check_docker_image("docker-sonic-gnmi") + if not ret: + logger.log_debug("Ignoring telemetry container check on image which has no corresponding docker image") + else: + container_list.append("gnmi") + continue + container_list.append(container_name) + + for container_name in container_list: + feature_entry = feature_table[container_name] if feature_entry["state"] not in ["disabled", "always_disabled"]: if multi_asic.is_multi_asic(): if feature_entry.get("has_global_scope", "True") == "True": - expected_running_containers.add(feature_name) - container_feature_dict[feature_name] = feature_name + expected_running_containers.add(container_name) + container_feature_dict[container_name] = container_name if feature_entry.get("has_per_asic_scope", "False") == "True": num_asics = multi_asic.get_num_asics() for asic_id in range(num_asics): - if asic_id in asics_id_presence or feature_name in run_all_instance_list: - expected_running_containers.add(feature_name + str(asic_id)) - container_feature_dict[feature_name + str(asic_id)] = feature_name + if asic_id in asics_id_presence or container_name in run_all_instance_list: + expected_running_containers.add(container_name + str(asic_id)) + container_feature_dict[container_name + str(asic_id)] = container_name else: - expected_running_containers.add(feature_name) - container_feature_dict[feature_name] = feature_name + expected_running_containers.add(container_name) + container_feature_dict[container_name] = container_name if device_info.is_supervisor(): expected_running_containers.add("database-chassis") diff --git a/src/system-health/tests/telemetry/etc/supervisor/critical_processes b/src/system-health/tests/telemetry/etc/supervisor/critical_processes new file mode 100644 index 000000000000..fd693f80070d --- /dev/null +++ b/src/system-health/tests/telemetry/etc/supervisor/critical_processes @@ -0,0 +1 @@ +program:gnmi-native diff --git a/src/system-health/tests/test_system_health.py b/src/system-health/tests/test_system_health.py index ab44fcd66c9d..4aeb3b18f041 100644 --- a/src/system-health/tests/test_system_health.py +++ b/src/system-health/tests/test_system_health.py @@ -12,6 +12,7 @@ import copy import os import sys +import docker from imp import load_source from swsscommon import swsscommon @@ -23,6 +24,7 @@ swsscommon.SonicV2Connector = MockConnector test_path = os.path.dirname(os.path.abspath(__file__)) +telemetry_path = os.path.join(test_path, 'telemetry') modules_path = os.path.dirname(test_path) scripts_path = os.path.join(modules_path, 'scripts') sys.path.insert(0, modules_path) @@ -166,6 +168,53 @@ def test_service_checker_single_asic(mock_config_db, mock_run, mock_docker_clien assert origin_container_critical_processes == checker.container_critical_processes +@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock()) +@patch('health_checker.service_checker.ServiceChecker._get_container_folder', MagicMock(return_value=telemetry_path)) +@patch('sonic_py_common.multi_asic.is_multi_asic', MagicMock(return_value=False)) +@patch('docker.DockerClient') +@patch('health_checker.utils.run_command') +@patch('swsscommon.swsscommon.ConfigDBConnector') +def test_service_checker_telemetry(mock_config_db, mock_run, mock_docker_client): + setup() + mock_db_data = MagicMock() + mock_get_table = MagicMock() + mock_db_data.get_table = mock_get_table + mock_config_db.return_value = mock_db_data + mock_get_table.return_value = { + 'gnmi': { + 'state': 'enabled', + 'has_global_scope': 'True', + 'has_per_asic_scope': 'False', + + }, + 'telemetry': { + 'state': 'enabled', + 'has_global_scope': 'True', + 'has_per_asic_scope': 'False', + + } + } + mock_containers = MagicMock() + mock_gnmi_container = MagicMock() + mock_gnmi_container.name = 'gnmi' + mock_containers.list = MagicMock(return_value=[mock_gnmi_container]) + mock_docker_client_object = MagicMock() + mock_docker_client.return_value = mock_docker_client_object + mock_docker_client_object.containers = mock_containers + mock_docker_client_object.images = MagicMock() + mock_docker_client_object.images.get = MagicMock() + except_err = docker.errors.ImageNotFound("Unit test") + mock_docker_client_object.images.get.side_effect = [except_err, None] + + mock_run.return_value = "gnmi-native RUNNING pid 67, uptime 1:03:56" + + checker = ServiceChecker() + assert checker.get_category() == 'Services' + config = Config() + checker.check(config) + assert 'gnmi:gnmi-native' in checker._info + assert checker._info['gnmi:gnmi-native'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK + @patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock()) @patch('health_checker.service_checker.ServiceChecker._get_container_folder', MagicMock(return_value=test_path))