implement memory utilization fixture (sonic-net#13698)

What is the motivation for this PR? Test gap for memory leak How did you do it? Introduced a new fixture to collect the memory information before and after the test case. Then compare the memory information to confirm that it has not exceeded the high memory usage threshold and that no memory leaks have occurred. How did you verify/test it? Run case locally. Set a fake threshold, check the result. Run nightly pipeline.
arista-hpandya · Oct 2, 2024 · 0fe4eb7 · 0fe4eb7
1 parent 3145843
commit 0fe4eb7
Show file tree

Hide file tree

Showing 6 changed files with 1,089 additions and 1 deletion.
diff --git a/tests/common/plugins/memory_utilization/README.md b/tests/common/plugins/memory_utilization/README.md
diff --git a/tests/common/plugins/memory_utilization/__init__.py b/tests/common/plugins/memory_utilization/__init__.py
@@ -0,0 +1,96 @@
+import logging
+import pytest
+from tests.common.plugins.memory_utilization.memory_utilization import MemoryMonitor
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--disable_memory_utilization",
+        action="store_true",
+        default=False,
+        help="Disable memory utilization analysis for the 'memory_utilization' fixture"
+    )
+
+
+@pytest.fixture(scope="function", autouse=True)
+def store_fixture_values(request, duthosts, memory_utilization):
+    logging.info("store memory_utilization {}".format(request.node.name))
+    request.config.store_duthosts = duthosts
+    request.config.store_memory_utilization = memory_utilization
+
+
+@pytest.hookimpl(trylast=True)
+def pytest_runtest_setup(item):
+    logging.info("collect memory before test {}".format(item.name))
+
+    duthosts = getattr(item.config, 'store_duthosts', None)
+    memory_utilization = getattr(item.config, 'store_memory_utilization', None)
+    if duthosts is None and memory_utilization is None:
+        return
+
+    memory_monitors, memory_values = memory_utilization
+
+    logging.debug("memory_values {} ".format(memory_values))
+
+    for duthost in duthosts:
+        if duthost.topo_type == 't2':
+            continue
+
+        # Initial memory check for all registered commands
+        for name, cmd, memory_params, memory_check in memory_monitors[duthost.hostname].commands:
+            output = memory_monitors[duthost.hostname].execute_command(cmd)
+            memory_values["before_test"][duthost.hostname][name] = memory_check(output, memory_params)
+
+    logging.info("Before test: collected memory_values {}".format(memory_values))
+
+
+@pytest.hookimpl(tryfirst=True)
+def pytest_runtest_teardown(item, nextitem):
+    logging.info("collect memory after test {}".format(item.name))
+
+    duthosts = getattr(item.config, 'store_duthosts', None)
+    memory_utilization = getattr(item.config, 'store_memory_utilization', None)
+    if duthosts is None and memory_utilization is None:
+        return
+
+    memory_monitors, memory_values = memory_utilization
+
+    logging.debug("memory_values {} ".format(memory_values))
+
+    for duthost in duthosts:
+        if duthost.topo_type == 't2':
+            continue
+
+        # memory check for all registered commands
+        for name, cmd, memory_params, memory_check in memory_monitors[duthost.hostname].commands:
+            output = memory_monitors[duthost.hostname].execute_command(cmd)
+            memory_values["after_test"][duthost.hostname][name] = memory_check(output, memory_params)
+
+        memory_monitors[duthost.hostname].check_memory_thresholds(
+            memory_values["after_test"][duthost.hostname], memory_values["before_test"][duthost.hostname])
+
+    logging.info("After test: collected memory_values {}".format(memory_values))
+
+
+@pytest.fixture(autouse=True)
+def memory_utilization(duthosts, request):
+    if request.config.getoption("--disable_memory_utilization") or "disable_memory_utilization" in request.keywords:
+        logging.info("Memory utilization is disabled")
+        yield None, None
+        return
+
+    memory_monitors = {}
+    memory_values = {"before_test": {}, "after_test": {}}
+
+    for duthost in duthosts:
+        if duthost.topo_type == 't2':
+            continue
+        memory_monitor = MemoryMonitor(ansible_host=duthost)
+        memory_values["before_test"][duthost.hostname] = {}
+        memory_values["after_test"][duthost.hostname] = {}
+        logging.info("Hostname: {}, Hwsku: {}, Platform: {}".format(
+            duthost.hostname, duthost.sonichost._facts["hwsku"], duthost.sonichost._facts["platform"]))
+        memory_monitor.parse_and_register_commands(hwsku=duthost.sonichost._facts["hwsku"])
+        memory_monitors[duthost.hostname] = memory_monitor
+
+    yield memory_monitors, memory_values
diff --git a/tests/common/plugins/memory_utilization/memory_utilization.py b/tests/common/plugins/memory_utilization/memory_utilization.py
@@ -0,0 +1,253 @@
+import logging
+import re
+import json
+from os.path import join, split
+import pytest
+
+logger = logging.getLogger(__name__)
+
+MEMORY_UTILIZATION_COMMON_JSON_FILE = join(split(__file__)[0], "memory_utilization_common.json")
+MEMORY_UTILIZATION_DEPENDENCE_JSON_FILE = join(split(__file__)[0], "memory_utilization_dependence.json")
+
+
+class MemoryMonitor:
+    def __init__(self, ansible_host):
+        self.ansible_host = ansible_host
+        self.commands = []
+        self.memory_values = {}
+
+    def register_command(self, name, cmd, memory_params, memory_check_fn):
+        """Register a command with its associated memory parameters and check function."""
+        self.commands.append((name, cmd, memory_params, memory_check_fn))
+        self.memory_values[name] = {}
+
+    def execute_command(self, cmd):
+        """Execute a shell command and return its output."""
+        response = self.ansible_host.command(cmd, module_ignore_errors=True)
+        stdout = response.get('stdout', None)
+        # logger.debug("Command '{}' response: {}".format(cmd, stdout))
+        return stdout
+
+    def check_memory_thresholds(self, current_values, previous_values):
+        """Check memory usage against thresholds. """
+        logger.debug("Previous values: {}".format(previous_values))
+        logger.debug("Current values: {}".format(current_values))
+
+        for name, cmd, memory_params, memory_check_fn in self.commands:
+            for mem_item, thresholds in memory_params.items():
+                current_value = float(current_values.get(name, {}).get(mem_item, 0))
+                previous_value = float(previous_values.get(name, {}).get(mem_item, 0))
+
+                if current_value == 0 or previous_value == 0:
+                    logger.warning("Skipping memory check for {}-{} due to zero value".format(name, mem_item))
+                    continue
+
+                high_threshold = float(thresholds.get("memory_high_threshold", float('inf')))
+                increase_threshold = float(thresholds.get("memory_increase_threshold", float('inf')))
+
+                if previous_value > high_threshold:
+                    self.handle_memory_threshold_exceeded(
+                        name, mem_item, previous_value, high_threshold,
+                        previous_values, current_values, is_current=False
+                    )
+
+                if current_value > high_threshold:
+                    self.handle_memory_threshold_exceeded(
+                        name, mem_item, current_value, high_threshold,
+                        previous_values, current_values, is_current=True
+                    )
+
+                increase = current_value - previous_value
+                if increase > increase_threshold:
+                    self.handle_memory_threshold_exceeded(
+                        name, mem_item, increase, increase_threshold,
+                        previous_values, current_values, is_increase=True
+                    )
+
+    def handle_memory_threshold_exceeded(self, name, mem_item, value, threshold,
+                                         previous_values, current_values, is_current=False, is_increase=False):
+
+        """Handle memory threshold or increase exceeded."""
+        logger.info("{}:{}, previous_values: {}".format(name, mem_item, previous_values))
+        logger.info("{}:{}, current_values: {}".format(name, mem_item, current_values))
+
+        if is_increase:
+            message = (
+                "[ALARM]: {}:{} memory usage increased by {}, "
+                "exceeds increase threshold {}".format(
+                    name, mem_item, value, threshold
+                )
+            )
+        else:
+            message = (
+                "[ALARM]: {}:{}, {} memory usage {} exceeds "
+                "high threshold {}".format(
+                    name, mem_item, "Current" if is_current else "Previous", value, threshold
+                )
+            )
+
+        logger.warning(message)
+        pytest.fail(message)
+
+    def parse_and_register_commands(self, hwsku=None):
+        """Initialize the MemoryMonitor by reading commands from JSON files and registering them."""
+
+        parameter_dict = {}
+        with open(MEMORY_UTILIZATION_COMMON_JSON_FILE, 'r') as file:
+            data = json.load(file)
+            memory_items = data.get("COMMON", [])
+            for item in memory_items:
+                name = item["name"]
+                command = item["cmd"]
+                memory_params = item["memory_params"]
+                memory_check_fn = item["memory_check"]
+                parameter_dict[name] = {
+                    'name': name,
+                    'cmd': command,
+                    'memory_params': memory_params,
+                    'memory_check_fn': memory_check_fn
+                }
+
+        with open(MEMORY_UTILIZATION_DEPENDENCE_JSON_FILE, 'r') as file:
+            data = json.load(file)
+            memory_items = data.get("COMMON", [])
+            for item in memory_items:
+                name = item["name"]
+                command = item["cmd"]
+                memory_params = item["memory_params"]
+                memory_check_fn = item["memory_check"]
+                parameter_dict[name] = {
+                    'name': name,
+                    'cmd': command,
+                    'memory_params': memory_params,
+                    'memory_check_fn': memory_check_fn
+                }
+
+            if hwsku:
+                hwsku_found = any(hwsku in sku_list for sku_list in data.get("HWSKU", {}).values())
+                if hwsku_found:
+                    for key, value in data["HWSKU"].items():
+                        if hwsku in value:
+                            for item in data[key]:
+                                logger.info("#### CMD {} ".format(item))
+                                name = item["name"]
+                                command = item["cmd"]
+                                memory_params = item["memory_params"]
+                                memory_check_fn = item["memory_check"]
+                                parameter_dict[name] = {
+                                    'name': name,
+                                    'cmd': command,
+                                    'memory_params': memory_params,
+                                    'memory_check_fn': memory_check_fn
+                                }
+
+        for param in parameter_dict.values():
+            logger.debug(
+                "Registering command: name={}, cmd={}, memory_params={}, "
+                "memory_check={}".format(
+                    param['name'], param['cmd'], param['memory_params'], param['memory_check_fn']
+                )
+            )
+            self.register_command(param['name'], param['cmd'], param['memory_params'], eval(param['memory_check_fn']))
+
+
+def parse_top_output(output, memory_params):
+    """Parse the 'top' command output to extract memory usage information."""
+    memory_values = {}
+    headers = []
+    length = 0
+    for line in output.split('\n'):
+        if "PID" in line and "USER" in line and "RES" in line and "COMMAND" in line:
+            headers = line.split()
+            length = len(headers)
+            continue
+
+        parts = line.split()
+        if length != 0 and len(parts) == length:
+            process_info = {headers[i]: parts[i] for i in range(length)}
+
+            for mem_item, thresholds in memory_params.items():
+                if mem_item in process_info["COMMAND"]:
+                    if mem_item in memory_values:
+                        memory_values[mem_item] += int(process_info["RES"])
+                    else:
+                        memory_values[mem_item] = int(process_info["RES"])
+
+    logger.debug("Parsed memory values: {}".format(memory_values))
+    return memory_values
+
+
+def parse_free_output(output, memory_params):
+    """Parse the 'free' command output to extract memory usage information."""
+    memory_values = {}
+    headers, Mem, Swap = [], [], []
+    for line in output.split('\n'):
+        if "total" in line:
+            headers = line.split()
+        if "Mem:" in line:
+            Mem = line.split()[1:]
+        if "Swap:" in line:
+            Swap = line.split()[1:]
+
+    mem_info = {headers[i]: int(Mem[i]) for i in range(len(Mem))}
+    swap_info = {headers[i]: int(Swap[i]) for i in range(len(Swap))}
+
+    for mem_item, _ in memory_params.items():
+        memory_values[mem_item] = mem_info.get(mem_item, 0) + swap_info.get(mem_item, 0)
+
+    logger.debug("Parsed memory values: {}".format(memory_values))
+    return memory_values
+
+
+def parse_monit_status_output(output, memory_params):
+    """Parse the 'monit status' command output to extract memory usage information."""
+    memory_values = {}
+    memory_pattern = r"memory usage\s+([\d\.]+ \w+)\s+\[(\d+\.\d+)%\]"
+    swap_pattern = r"swap usage\s+([\d\.]+ \w+)\s+\[(\d+\.\d+)%\]"
+
+    for line in output.split('\n'):
+        if "memory usage" in line:
+            match = re.search(memory_pattern, line)
+            if match:
+                used_memory = match.group(1)         # noqa F841
+                memory_percentage = match.group(2)
+                memory_values['memory_usage'] = float(memory_percentage)
+            else:
+                logger.error("Failed to parse memory usage from line: {}".format(line))
+        if "swap usage" in line:
+            match = re.search(swap_pattern, line)
+            if match:
+                used_swap = match.group(1)            # noqa F841
+                swap_percentage = match.group(2)      # noqa F841
+            else:
+                logger.debug("Failed to parse swap usage from line: {}".format(line))
+
+    logger.debug("Parsed memory values: {}".format(memory_values))
+    return memory_values
+
+
+def parse_docker_stats_output(output, memory_params):
+    memory_values = {}
+    length = 0
+    pattern = r"(\d+\.\d+)%.*?(\d+\.\d+)%"
+
+    for line in output.split('\n'):
+        if "NAME" in line and "CPU" in line and "MEM" in line:
+            headers = line.split()
+            length = len(headers)
+            continue
+
+        if length != 0:
+            for mem_item, thresholds in memory_params.items():
+                if mem_item in line:
+                    match = re.search(pattern, line)
+                    if match:
+                        mem_usage = match.group(2)
+                        memory_values[mem_item] = mem_usage
+                    else:
+                        logger.error("Failed to parse memory usage from line: {}".format(line))
+                else:
+                    continue
+
+    logger.debug("Parsed memory values: {}".format(memory_values))
+    return memory_values
diff --git a/tests/common/plugins/memory_utilization/memory_utilization_common.json b/tests/common/plugins/memory_utilization/memory_utilization_common.json
@@ -0,0 +1,15 @@
+{
+  "COMMON": [
+    {
+      "name": "monit",
+      "cmd": "sudo monit status",
+      "memory_params": {
+        "memory_usage": {
+          "memory_increase_threshold": 50,
+          "memory_high_threshold": 90
+        }
+      },
+      "memory_check": "parse_monit_status_output"
+    }
+  ]
+}
diff --git a/tests/common/plugins/memory_utilization/memory_utilization_dependence.json b/tests/common/plugins/memory_utilization/memory_utilization_dependence.json
@@ -0,0 +1 @@
+{}
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -84,7 +84,8 @@
                   'tests.platform_tests.api',
                   'tests.common.plugins.allure_server',
                   'tests.common.plugins.conditional_mark',
-                  'tests.common.plugins.random_seed')
+                  'tests.common.plugins.random_seed',
+                  'tests.common.plugins.memory_utilization')
 
 
 def pytest_addoption(parser):