From 1fba66c5d41a19ce9b02f64cd2091c0783da0a78 Mon Sep 17 00:00:00 2001 From: ganglv <88995770+ganglyu@users.noreply.github.com> Date: Tue, 18 Jun 2024 10:13:58 +0800 Subject: [PATCH] Improve service checker for gnmi/telemetry container (#19153) Why I did it Fix #19081 We have used gnmi container to replace telemetry container, and telemetry is still enabled after upgrade. service_checker script reads from features table and check if the container is running, telemetry is enabled but there's no telemetry container. It's difficult to disable telemetry in feature table for warm reboot and cold reboot, we need to check docker image in db migrator and minigraph.py. When we use warm reboot to upgrade from 202305 to 202311, config_db still has telemetry configuration, and we can't simply remove related configuration. Work item tracking Microsoft ADO (number only): How I did it I modify service_checker script: If there's docker-sonic-telemetry image, check telemetry container. If there's no docker-sonic-telemetry image, check gnmi container instead. If there's no docker-sonic-telemetry image and docker-sonic-gnmi image, do not check telemetry. How to verify it Run unit test and end to end test. --- .../health_checker/service_checker.py | 47 +++++++++++++++--- .../etc/supervisor/critical_processes | 1 + src/system-health/tests/test_system_health.py | 49 +++++++++++++++++++ 3 files changed, 89 insertions(+), 8 deletions(-) create mode 100644 src/system-health/tests/telemetry/etc/supervisor/critical_processes diff --git a/src/system-health/health_checker/service_checker.py b/src/system-health/health_checker/service_checker.py index bc38bd43a929..10ec7e678595 100644 --- a/src/system-health/health_checker/service_checker.py +++ b/src/system-health/health_checker/service_checker.py @@ -15,6 +15,19 @@ EVENTS_PUBLISHER_SOURCE = "sonic-events-host" EVENTS_PUBLISHER_TAG = "process-not-running" +def check_docker_image(image_name): + """ + @summary: This function will check if docker image exists. + @return: True if the image exists, otherwise False. + """ + try: + DOCKER_CLIENT = docker.DockerClient(base_url='unix://var/run/docker.sock') + DOCKER_CLIENT.images.get(image_name) + return True + except (docker.errors.ImageNotFound, docker.errors.APIError) as err: + logger.log_warning("Failed to get image '{}'. Error: '{}'".format(image_name, err)) + return False + class ServiceChecker(HealthChecker): """ Checker that checks critical system service status via monit service. @@ -84,21 +97,39 @@ def get_expected_running_containers(self, feature_table): # it will be removed from exception list. run_all_instance_list = ['database', 'bgp'] - for feature_name, feature_entry in feature_table.items(): + container_list = [] + for container_name in feature_table.keys(): + # slim image does not have telemetry container and corresponding docker image + if container_name == "telemetry": + ret = check_docker_image("docker-sonic-telemetry") + if not ret: + # If telemetry container image is not present, check gnmi container image + # If gnmi container image is not present, ignore telemetry container check + # if gnmi container image is present, check gnmi container instead of telemetry + ret = check_docker_image("docker-sonic-gnmi") + if not ret: + logger.log_debug("Ignoring telemetry container check on image which has no corresponding docker image") + else: + container_list.append("gnmi") + continue + container_list.append(container_name) + + for container_name in container_list: + feature_entry = feature_table[container_name] if feature_entry["state"] not in ["disabled", "always_disabled"]: if multi_asic.is_multi_asic(): if feature_entry.get("has_global_scope", "True") == "True": - expected_running_containers.add(feature_name) - container_feature_dict[feature_name] = feature_name + expected_running_containers.add(container_name) + container_feature_dict[container_name] = container_name if feature_entry.get("has_per_asic_scope", "False") == "True": num_asics = multi_asic.get_num_asics() for asic_id in range(num_asics): - if asic_id in asics_id_presence or feature_name in run_all_instance_list: - expected_running_containers.add(feature_name + str(asic_id)) - container_feature_dict[feature_name + str(asic_id)] = feature_name + if asic_id in asics_id_presence or container_name in run_all_instance_list: + expected_running_containers.add(container_name + str(asic_id)) + container_feature_dict[container_name + str(asic_id)] = container_name else: - expected_running_containers.add(feature_name) - container_feature_dict[feature_name] = feature_name + expected_running_containers.add(container_name) + container_feature_dict[container_name] = container_name if device_info.is_supervisor(): expected_running_containers.add("database-chassis") diff --git a/src/system-health/tests/telemetry/etc/supervisor/critical_processes b/src/system-health/tests/telemetry/etc/supervisor/critical_processes new file mode 100644 index 000000000000..fd693f80070d --- /dev/null +++ b/src/system-health/tests/telemetry/etc/supervisor/critical_processes @@ -0,0 +1 @@ +program:gnmi-native diff --git a/src/system-health/tests/test_system_health.py b/src/system-health/tests/test_system_health.py index ab44fcd66c9d..4aeb3b18f041 100644 --- a/src/system-health/tests/test_system_health.py +++ b/src/system-health/tests/test_system_health.py @@ -12,6 +12,7 @@ import copy import os import sys +import docker from imp import load_source from swsscommon import swsscommon @@ -23,6 +24,7 @@ swsscommon.SonicV2Connector = MockConnector test_path = os.path.dirname(os.path.abspath(__file__)) +telemetry_path = os.path.join(test_path, 'telemetry') modules_path = os.path.dirname(test_path) scripts_path = os.path.join(modules_path, 'scripts') sys.path.insert(0, modules_path) @@ -166,6 +168,53 @@ def test_service_checker_single_asic(mock_config_db, mock_run, mock_docker_clien assert origin_container_critical_processes == checker.container_critical_processes +@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock()) +@patch('health_checker.service_checker.ServiceChecker._get_container_folder', MagicMock(return_value=telemetry_path)) +@patch('sonic_py_common.multi_asic.is_multi_asic', MagicMock(return_value=False)) +@patch('docker.DockerClient') +@patch('health_checker.utils.run_command') +@patch('swsscommon.swsscommon.ConfigDBConnector') +def test_service_checker_telemetry(mock_config_db, mock_run, mock_docker_client): + setup() + mock_db_data = MagicMock() + mock_get_table = MagicMock() + mock_db_data.get_table = mock_get_table + mock_config_db.return_value = mock_db_data + mock_get_table.return_value = { + 'gnmi': { + 'state': 'enabled', + 'has_global_scope': 'True', + 'has_per_asic_scope': 'False', + + }, + 'telemetry': { + 'state': 'enabled', + 'has_global_scope': 'True', + 'has_per_asic_scope': 'False', + + } + } + mock_containers = MagicMock() + mock_gnmi_container = MagicMock() + mock_gnmi_container.name = 'gnmi' + mock_containers.list = MagicMock(return_value=[mock_gnmi_container]) + mock_docker_client_object = MagicMock() + mock_docker_client.return_value = mock_docker_client_object + mock_docker_client_object.containers = mock_containers + mock_docker_client_object.images = MagicMock() + mock_docker_client_object.images.get = MagicMock() + except_err = docker.errors.ImageNotFound("Unit test") + mock_docker_client_object.images.get.side_effect = [except_err, None] + + mock_run.return_value = "gnmi-native RUNNING pid 67, uptime 1:03:56" + + checker = ServiceChecker() + assert checker.get_category() == 'Services' + config = Config() + checker.check(config) + assert 'gnmi:gnmi-native' in checker._info + assert checker._info['gnmi:gnmi-native'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK + @patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock()) @patch('health_checker.service_checker.ServiceChecker._get_container_folder', MagicMock(return_value=test_path))