Skip to content

Commit

Permalink
implement memory utilization fixture (sonic-net#13698)
Browse files Browse the repository at this point in the history
What is the motivation for this PR?
There was a test gap: no existing test checked for memory leaks.

How did you do it?
Introduced a new fixture to collect the memory information before and after the test case.
Then compare the memory information to confirm that it has not exceeded the high memory usage threshold and that no memory leaks have occurred.

How did you verify/test it?
Ran the test cases locally.
Set a fake threshold and checked the result.
Ran the nightly pipeline.
  • Loading branch information
lipxu authored and arista-hpandya committed Oct 2, 2024
1 parent 3145843 commit 0fe4eb7
Show file tree
Hide file tree
Showing 6 changed files with 1,089 additions and 1 deletion.
722 changes: 722 additions & 0 deletions tests/common/plugins/memory_utilization/README.md

Large diffs are not rendered by default.

96 changes: 96 additions & 0 deletions tests/common/plugins/memory_utilization/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import logging
import pytest
from tests.common.plugins.memory_utilization.memory_utilization import MemoryMonitor


def pytest_addoption(parser):
    """Register the command-line flag that disables memory utilization analysis.

    Args:
        parser: pytest option parser; only its ``addoption`` method is used.
    """
    option_settings = {
        "action": "store_true",
        "default": False,
        "help": "Disable memory utilization analysis for the 'memory_utilization' fixture",
    }
    parser.addoption("--disable_memory_utilization", **option_settings)


@pytest.fixture(scope="function", autouse=True)
def store_fixture_values(request, duthosts, memory_utilization):
    """Copy the per-test fixture values onto ``request.config``.

    The pytest_runtest_setup/teardown hooks only receive the test item, so they
    read ``store_duthosts`` and ``store_memory_utilization`` back from the
    config object.
    """
    test_name = request.node.name
    logging.info("store memory_utilization {}".format(test_name))

    config = request.config
    config.store_duthosts = duthosts
    config.store_memory_utilization = memory_utilization


@pytest.hookimpl(trylast=True)
def pytest_runtest_setup(item):
    """Record the "before test" memory readings for every monitored DUT.

    Reads ``store_duthosts`` and ``store_memory_utilization`` from
    ``item.config``. For each host that is not of topo type 't2', every command
    registered on that host's monitor is executed and its parsed result is
    stored under ``memory_values["before_test"][hostname][name]``.

    Nothing is collected when either attribute is missing, or when the stored
    fixture value is the ``(None, None)`` pair that the ``memory_utilization``
    fixture yields while the analysis is disabled.

    Args:
        item: the pytest test item about to be set up.
    """
    logging.info("collect memory before test {}".format(item.name))

    duthosts = getattr(item.config, 'store_duthosts', None)
    memory_utilization = getattr(item.config, 'store_memory_utilization', None)
    # Both values are needed below, so a single missing one is enough to stop.
    if duthosts is None or memory_utilization is None:
        return

    memory_monitors, memory_values = memory_utilization
    # Disabled analysis is stored as (None, None); there is nothing to index.
    if memory_monitors is None or memory_values is None:
        return

    logging.debug("memory_values {} ".format(memory_values))

    for duthost in duthosts:
        if duthost.topo_type == 't2':
            continue

        # Initial memory check for all registered commands
        monitor = memory_monitors[duthost.hostname]
        host_values = memory_values["before_test"][duthost.hostname]
        for name, cmd, memory_params, memory_check in monitor.commands:
            output = monitor.execute_command(cmd)
            host_values[name] = memory_check(output, memory_params)

    logging.info("Before test: collected memory_values {}".format(memory_values))


@pytest.hookimpl(tryfirst=True)
def pytest_runtest_teardown(item, nextitem):
    """Record the "after test" memory readings and check them against thresholds.

    Reads ``store_duthosts`` and ``store_memory_utilization`` from
    ``item.config``. For each host that is not of topo type 't2', every command
    registered on that host's monitor is executed and its parsed result is
    stored under ``memory_values["after_test"][hostname][name]``. The host's
    monitor then compares the "after" readings with the "before" readings via
    ``check_memory_thresholds``.

    Nothing is collected when either attribute is missing, or when the stored
    fixture value is the ``(None, None)`` pair that the ``memory_utilization``
    fixture yields while the analysis is disabled.

    Args:
        item: the pytest test item being torn down.
        nextitem: the next test item; not used here.
    """
    logging.info("collect memory after test {}".format(item.name))

    duthosts = getattr(item.config, 'store_duthosts', None)
    memory_utilization = getattr(item.config, 'store_memory_utilization', None)
    # Both values are needed below, so a single missing one is enough to stop.
    if duthosts is None or memory_utilization is None:
        return

    memory_monitors, memory_values = memory_utilization
    # Disabled analysis is stored as (None, None); there is nothing to index.
    if memory_monitors is None or memory_values is None:
        return

    logging.debug("memory_values {} ".format(memory_values))

    for duthost in duthosts:
        if duthost.topo_type == 't2':
            continue

        # memory check for all registered commands
        monitor = memory_monitors[duthost.hostname]
        after_values = memory_values["after_test"][duthost.hostname]
        for name, cmd, memory_params, memory_check in monitor.commands:
            output = monitor.execute_command(cmd)
            after_values[name] = memory_check(output, memory_params)

        monitor.check_memory_thresholds(
            after_values, memory_values["before_test"][duthost.hostname])

    logging.info("After test: collected memory_values {}".format(memory_values))


@pytest.fixture(autouse=True)
def memory_utilization(duthosts, request):
    """Yield the per-host memory monitors and the dict that holds their readings.

    Yields ``(None, None)`` when the ``--disable_memory_utilization`` option is
    set or ``disable_memory_utilization`` is among the test's keywords.
    Otherwise yields ``(memory_monitors, memory_values)``: one ``MemoryMonitor``
    per host whose topo type is not 't2', keyed by hostname, and a dict with
    empty per-host entries under "before_test" and "after_test".
    """
    disabled_by_option = request.config.getoption("--disable_memory_utilization")
    if disabled_by_option or "disable_memory_utilization" in request.keywords:
        logging.info("Memory utilization is disabled")
        yield None, None
        return

    memory_monitors = {}
    before_test, after_test = {}, {}
    memory_values = {"before_test": before_test, "after_test": after_test}

    # Hosts with topo type 't2' get no monitor at all.
    for duthost in (host for host in duthosts if host.topo_type != 't2'):
        monitor = MemoryMonitor(ansible_host=duthost)
        hostname = duthost.hostname
        before_test[hostname] = {}
        after_test[hostname] = {}

        facts = duthost.sonichost._facts
        logging.info("Hostname: {}, Hwsku: {}, Platform: {}".format(
            hostname, facts["hwsku"], facts["platform"]))
        monitor.parse_and_register_commands(hwsku=facts["hwsku"])
        memory_monitors[hostname] = monitor

    yield memory_monitors, memory_values
253 changes: 253 additions & 0 deletions tests/common/plugins/memory_utilization/memory_utilization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@
import logging
import re
import json
from os.path import join, split
import pytest

logger = logging.getLogger(__name__)

# The JSON command definitions are looked up in the directory of this module.
_PLUGIN_DIR = split(__file__)[0]
MEMORY_UTILIZATION_COMMON_JSON_FILE = join(_PLUGIN_DIR, "memory_utilization_common.json")
MEMORY_UTILIZATION_DEPENDENCE_JSON_FILE = join(_PLUGIN_DIR, "memory_utilization_dependence.json")


class MemoryMonitor:
    """Run registered memory commands on one host and check the parsed values.

    Every registered command is a ``(name, cmd, memory_params, memory_check_fn)``
    tuple kept in ``self.commands``. ``memory_params`` maps each memory item to
    a dict of thresholds ("memory_high_threshold" and
    "memory_increase_threshold", both optional).
    """

    def __init__(self, ansible_host):
        """Remember the host object and start with no registered commands."""
        self.ansible_host = ansible_host
        # Registered command tuples, in registration order.
        self.commands = []
        # One (initially empty) dict per registered command name.
        self.memory_values = {}

    def register_command(self, name, cmd, memory_params, memory_check_fn):
        """Register a command with its associated memory parameters and check function."""
        self.commands.append((name, cmd, memory_params, memory_check_fn))
        self.memory_values[name] = {}

    def execute_command(self, cmd):
        """Execute a shell command and return its output.

        Returns:
            The ``stdout`` entry of the command response, or ``None`` when the
            response has no such entry.
        """
        response = self.ansible_host.command(cmd, module_ignore_errors=True)
        return response.get('stdout', None)

    def check_memory_thresholds(self, current_values, previous_values):
        """Check memory usage against thresholds.

        For every memory item of every registered command, the previous and the
        current reading are compared with the item's high threshold, and their
        difference with the item's increase threshold. A missing threshold is
        treated as infinite. Items whose previous or current reading is missing
        or zero are skipped with a warning.

        Args:
            current_values: ``{command name: {mem_item: value}}`` after the test.
            previous_values: same structure, collected before the test.

        Raises:
            The pytest failure raised by ``pytest.fail`` on the first exceeded
            threshold.
        """
        logger.debug("Previous values: {}".format(previous_values))
        logger.debug("Current values: {}".format(current_values))

        for name, cmd, memory_params, memory_check_fn in self.commands:
            for mem_item, thresholds in memory_params.items():
                current_value = float(current_values.get(name, {}).get(mem_item, 0))
                previous_value = float(previous_values.get(name, {}).get(mem_item, 0))

                if current_value == 0 or previous_value == 0:
                    logger.warning("Skipping memory check for {}-{} due to zero value".format(name, mem_item))
                    continue

                high_threshold = float(thresholds.get("memory_high_threshold", float('inf')))
                increase_threshold = float(thresholds.get("memory_increase_threshold", float('inf')))

                if previous_value > high_threshold:
                    self.handle_memory_threshold_exceeded(
                        name, mem_item, previous_value, high_threshold,
                        previous_values, current_values, is_current=False
                    )

                if current_value > high_threshold:
                    self.handle_memory_threshold_exceeded(
                        name, mem_item, current_value, high_threshold,
                        previous_values, current_values, is_current=True
                    )

                increase = current_value - previous_value
                if increase > increase_threshold:
                    self.handle_memory_threshold_exceeded(
                        name, mem_item, increase, increase_threshold,
                        previous_values, current_values, is_increase=True
                    )

    def handle_memory_threshold_exceeded(self, name, mem_item, value, threshold,
                                         previous_values, current_values, is_current=False, is_increase=False):
        """Handle memory threshold or increase exceeded.

        Logs both sets of readings, builds the alarm message, and fails the
        running test with it.

        Args:
            name: name of the registered command.
            mem_item: memory item within that command's parameters.
            value: the offending reading, or the increase when ``is_increase``.
            threshold: the threshold that was exceeded.
            previous_values: readings collected before the test (logged only).
            current_values: readings collected after the test (logged only).
            is_current: whether ``value`` is the current (True) or the previous
                (False) reading; only used when ``is_increase`` is False.
            is_increase: whether the increase threshold was exceeded.
        """
        logger.info("{}:{}, previous_values: {}".format(name, mem_item, previous_values))
        logger.info("{}:{}, current_values: {}".format(name, mem_item, current_values))

        if is_increase:
            message = (
                "[ALARM]: {}:{} memory usage increased by {}, "
                "exceeds increase threshold {}".format(
                    name, mem_item, value, threshold
                )
            )
        else:
            message = (
                "[ALARM]: {}:{}, {} memory usage {} exceeds "
                "high threshold {}".format(
                    name, mem_item, "Current" if is_current else "Previous", value, threshold
                )
            )

        logger.warning(message)
        pytest.fail(message)

    @staticmethod
    def _add_command_spec(parameter_dict, item):
        """Store one JSON command entry in ``parameter_dict`` under its name."""
        name = item["name"]
        parameter_dict[name] = {
            'name': name,
            'cmd': item["cmd"],
            'memory_params': item["memory_params"],
            'memory_check_fn': item["memory_check"]
        }

    def parse_and_register_commands(self, hwsku=None):
        """Initialize the MemoryMonitor by reading commands from JSON files and registering them.

        Entries are merged by name in this order, later ones replacing earlier
        ones: the "COMMON" list of the common file, the "COMMON" list of the
        dependence file, then (when ``hwsku`` is given) every group of the
        dependence file whose "HWSKU" list contains ``hwsku``.

        Args:
            hwsku: hardware SKU used to select extra groups; ``None`` or an
                empty value selects none.
        """
        parameter_dict = {}

        with open(MEMORY_UTILIZATION_COMMON_JSON_FILE, 'r') as file:
            common_data = json.load(file)
        for item in common_data.get("COMMON", []):
            self._add_command_spec(parameter_dict, item)

        with open(MEMORY_UTILIZATION_DEPENDENCE_JSON_FILE, 'r') as file:
            dependence_data = json.load(file)
        for item in dependence_data.get("COMMON", []):
            self._add_command_spec(parameter_dict, item)

        if hwsku:
            # Only the dependence file carries the HWSKU-to-group mapping.
            for key, sku_list in dependence_data.get("HWSKU", {}).items():
                if hwsku not in sku_list:
                    continue
                for item in dependence_data[key]:
                    logger.info("#### CMD {} ".format(item))
                    self._add_command_spec(parameter_dict, item)

        for param in parameter_dict.values():
            logger.debug(
                "Registering command: name={}, cmd={}, memory_params={}, "
                "memory_check={}".format(
                    param['name'], param['cmd'], param['memory_params'], param['memory_check_fn']
                )
            )
            # NOTE(review): eval() turns the "memory_check" string from the JSON
            # file into a callable. This is only safe while those files are
            # repository-controlled; consider an explicit name-to-function map.
            self.register_command(param['name'], param['cmd'], param['memory_params'], eval(param['memory_check_fn']))


def parse_top_output(output, memory_params):
    """Parse the 'top' command output to extract memory usage information.

    The header row is the line containing "PID", "USER", "RES" and "COMMAND".
    Each later row with the same number of columns is treated as one process.
    For every key of ``memory_params`` that occurs in a row's COMMAND column,
    the row's RES value is added to that key's total.

    Args:
        output: stdout of ``top``; ``None`` or an empty string yields ``{}``.
        memory_params: mapping whose keys are the process names to look for.

    Returns:
        ``{mem_item: summed RES}`` for the keys that matched at least one row.
        Plain RES values are summed as printed; suffixed values are converted
        to KiB first.

    Raises:
        ValueError: if a matching row's RES value is not numeric.
    """
    memory_values = {}
    headers = []
    # The command runner may hand over None when there was no stdout.
    for line in (output or "").split('\n'):
        if "PID" in line and "USER" in line and "RES" in line and "COMMAND" in line:
            headers = line.split()
            continue

        parts = line.split()
        if not headers or len(parts) != len(headers):
            continue
        process_info = dict(zip(headers, parts))

        for mem_item in memory_params:
            if mem_item in process_info["COMMAND"]:
                resident = _top_res_to_kib(process_info["RES"])
                memory_values[mem_item] = memory_values.get(mem_item, 0) + resident

    logger.debug("Parsed memory values: {}".format(memory_values))
    return memory_values


def _top_res_to_kib(text):
    """Convert a RES column value from 'top' to an integer number of KiB."""
    # top scales large task-area memory values and appends a unit suffix
    # (for example "1.5g"); unsuffixed values are taken as printed.
    scale = {"k": 1, "m": 1024, "g": 1024 ** 2, "t": 1024 ** 3, "p": 1024 ** 4}
    suffix = text[-1:].lower()
    if suffix in scale:
        return int(float(text[:-1]) * scale[suffix])
    return int(text)


def parse_free_output(output, memory_params):
    """Parse the 'free' command output to extract memory usage information.

    The column names come from the line containing "total"; the values come
    from the "Mem:" and "Swap:" lines. Names and values are paired positionally,
    so columns without a value on a given line are left out for that line.

    Args:
        output: stdout of ``free``; ``None`` or an empty string is treated as
            having no lines.
        memory_params: mapping whose keys are the column names to report.

    Returns:
        ``{mem_item: Mem value + Swap value}`` for every key of
        ``memory_params``; a column missing from a line counts as 0.

    Raises:
        ValueError: if a paired value is not an integer.
    """
    headers, mem_fields, swap_fields = [], [], []
    # The command runner may hand over None when there was no stdout.
    for line in (output or "").split('\n'):
        if "total" in line:
            headers = line.split()
        if "Mem:" in line:
            mem_fields = line.split()[1:]
        if "Swap:" in line:
            swap_fields = line.split()[1:]

    # zip() stops at the shorter side, so a missing header row cannot raise.
    mem_info = {header: int(value) for header, value in zip(headers, mem_fields)}
    swap_info = {header: int(value) for header, value in zip(headers, swap_fields)}

    memory_values = {
        mem_item: mem_info.get(mem_item, 0) + swap_info.get(mem_item, 0)
        for mem_item in memory_params
    }

    logger.debug("Parsed memory values: {}".format(memory_values))
    return memory_values


def parse_monit_status_output(output, memory_params):
    """Parse the 'monit status' command output to extract memory usage information.

    Only the percentage in square brackets on a "memory usage" line is kept.
    "swap usage" lines are matched only so that an unparsable one can be logged.

    Args:
        output: stdout of ``monit status``; ``None`` or an empty string yields
            ``{}``.
        memory_params: not used by this parser; accepted so that all parsers
            share one signature.

    Returns:
        ``{"memory_usage": percentage as float}`` when a "memory usage" line
        was parsed (the last such line wins), otherwise ``{}``.
    """
    memory_values = {}
    memory_pattern = r"memory usage\s+([\d\.]+ \w+)\s+\[(\d+\.\d+)%\]"
    swap_pattern = r"swap usage\s+([\d\.]+ \w+)\s+\[(\d+\.\d+)%\]"

    # The command runner may hand over None when there was no stdout.
    for line in (output or "").split('\n'):
        if "memory usage" in line:
            match = re.search(memory_pattern, line)
            if match:
                memory_values['memory_usage'] = float(match.group(2))
            else:
                logger.error("Failed to parse memory usage from line: {}".format(line))
        if "swap usage" in line and not re.search(swap_pattern, line):
            logger.debug("Failed to parse swap usage from line: {}".format(line))

    logger.debug("Parsed memory values: {}".format(memory_values))
    return memory_values


def parse_docker_stats_output(output, memory_params):
    """Parse the 'docker stats' command output to extract memory usage information.

    Lines before the header row (the line containing "NAME", "CPU" and "MEM")
    are ignored. On every later line that contains a key of ``memory_params``,
    the second percentage found on the line is taken as that key's value.

    Args:
        output: stdout of ``docker stats``; ``None`` or an empty string yields
            ``{}``.
        memory_params: mapping whose keys are the names to look for in a line.

    Returns:
        ``{mem_item: percentage as float}`` for the keys found on a parsable
        line; when a key occurs on several lines the last one wins.
    """
    memory_values = {}
    header_seen = False
    pattern = r"(\d+\.\d+)%.*?(\d+\.\d+)%"

    # The command runner may hand over None when there was no stdout.
    for line in (output or "").split('\n'):
        if "NAME" in line and "CPU" in line and "MEM" in line:
            header_seen = True
            continue

        if not header_seen:
            continue

        for mem_item in memory_params:
            if mem_item not in line:
                continue
            match = re.search(pattern, line)
            if match:
                # Stored as a number, like the values of the other parsers.
                memory_values[mem_item] = float(match.group(2))
            else:
                logger.error("Failed to parse memory usage from line: {}".format(line))

    logger.debug("Parsed memory values: {}".format(memory_values))
    return memory_values
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"COMMON": [
{
"name": "monit",
"cmd": "sudo monit status",
"memory_params": {
"memory_usage": {
"memory_increase_threshold": 50,
"memory_high_threshold": 90
}
},
"memory_check": "parse_monit_status_output"
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
3 changes: 2 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@
'tests.platform_tests.api',
'tests.common.plugins.allure_server',
'tests.common.plugins.conditional_mark',
'tests.common.plugins.random_seed')
'tests.common.plugins.random_seed',
'tests.common.plugins.memory_utilization')


def pytest_addoption(parser):
Expand Down

0 comments on commit 0fe4eb7

Please sign in to comment.