From cf7322a79f182781767d563cffa315562e7e68dc Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Fri, 30 Aug 2024 10:24:16 -0400 Subject: [PATCH] DAOS-16366 test: Use agent/server config files from test directory (#14944) (#15033) Use agent, control, and server config files placed in the common test directory instead of /etc/daos with a systemctl override configuration file. Signed-off-by: Phil Henderson --- src/tests/ftest/daos_test/dfuse.py | 11 +- src/tests/ftest/launch.py | 17 +- src/tests/ftest/pool/destroy.py | 7 +- src/tests/ftest/recovery/ddb.py | 17 +- src/tests/ftest/util/agent_utils.py | 6 +- src/tests/ftest/util/apricot/apricot/test.py | 21 +- src/tests/ftest/util/collection_utils.py | 109 +------- src/tests/ftest/util/command_utils.py | 49 ++-- src/tests/ftest/util/environment_utils.py | 168 ++++++------- src/tests/ftest/util/fault_config_utils.py | 13 +- src/tests/ftest/util/file_utils.py | 147 +++++++++++ src/tests/ftest/util/general_utils.py | 173 +------------ src/tests/ftest/util/launch_utils.py | 79 +++++- src/tests/ftest/util/server_utils.py | 3 +- src/tests/ftest/util/systemctl_utils.py | 249 +++++++++++++++++++ 15 files changed, 652 insertions(+), 417 deletions(-) create mode 100644 src/tests/ftest/util/file_utils.py create mode 100644 src/tests/ftest/util/systemctl_utils.py diff --git a/src/tests/ftest/daos_test/dfuse.py b/src/tests/ftest/daos_test/dfuse.py index 205d1cd11a6..e735e745cab 100644 --- a/src/tests/ftest/daos_test/dfuse.py +++ b/src/tests/ftest/daos_test/dfuse.py @@ -10,7 +10,8 @@ from apricot import TestWithServers from cmocka_utils import CmockaUtils from dfuse_utils import get_dfuse, start_dfuse -from general_utils import create_directory, get_log_file +from file_utils import create_directory +from general_utils import get_log_file from job_manager_utils import get_job_manager @@ -80,7 +81,9 @@ def run_test(self, il_lib=None): else: # Bypass, simply create a remote directory and use that. 
mount_dir = '/tmp/dfuse-test' - create_directory(self.hostlist_clients, mount_dir) + result = create_directory(self.log, self.hostlist_clients, mount_dir) + if not result.passed: + self.fail(f"Error creating {mount_dir} on {result.failed_hosts}") cmocka_utils = CmockaUtils( self.hostlist_clients, "dfuse", self.outputdir, self.test_dir, self.log) @@ -118,7 +121,9 @@ def run_test(self, il_lib=None): else: # make D_IL_MOUNT_POINT different from mount_dir so it tests a non-DAOS filesystem dummy_dir = '/tmp/dummy' - create_directory(self.hostlist_clients, dummy_dir) + result = create_directory(self.log, self.hostlist_clients, dummy_dir) + if not result.passed: + self.fail(f"Error creating {dummy_dir} on {result.failed_hosts}") daos_test_env['D_IL_MOUNT_POINT'] = dummy_dir if cache_mode != 'writeback': command.append('--metadata') diff --git a/src/tests/ftest/launch.py b/src/tests/ftest/launch.py index ad5b034301d..f3daa8464d7 100755 --- a/src/tests/ftest/launch.py +++ b/src/tests/ftest/launch.py @@ -21,7 +21,8 @@ from util.code_coverage_utils import CodeCoverage from util.environment_utils import TestEnvironment, TestEnvironmentException, set_test_environment from util.host_utils import get_local_host -from util.launch_utils import LaunchException, TestGroup, setup_fuse_config, summarize_run +from util.launch_utils import (LaunchException, TestGroup, setup_fuse_config, setup_systemctl, + summarize_run) from util.logger_utils import LOG_FILE_FORMAT, get_console_handler, get_file_handler from util.network_utils import PROVIDER_ALIAS, SUPPORTED_PROVIDERS from util.package_utils import find_packages @@ -270,7 +271,8 @@ def _run(self, args): # pylint: disable=unsupported-binary-operation all_hosts = args.test_servers | args.test_clients | self.local_host self.details["installed packages"] = find_packages( - logger, all_hosts, "'^(daos|libfabric|mercury|ior|openmpi|mpifileutils)-'") + logger, all_hosts, + 
"'^(daos|libfabric|mercury|ior|openmpi|mpifileutils|mlnx-ofed-basic)-'") # Setup the test environment test_env = TestEnvironment() @@ -325,6 +327,15 @@ def _run(self, args): message = "Issue detected setting up the fuse configuration" setup_result.warn_test(logger, "Setup", message, sys.exc_info()) + # Setup override systemctl files + try: + clients = args.test_clients if args.test_clients else args.test_servers + cleanup_files = setup_systemctl( + logger, args.test_servers, clients | self.local_host, test_env) + except LaunchException: + message = "Issue detected setting up the systemctl configuration" + return self.get_exit_status(1, message, "Setup", sys.exc_info()) + # Get the core file pattern information core_files = {} if args.process_cores: @@ -370,7 +381,7 @@ def _run(self, args): logger, self.result, self.repeat, self.slurm_setup, args.sparse, args.failfast, not args.disable_stop_daos, args.archive, args.rename, args.jenkinslog, core_files, args.logs_threshold, args.user_create, code_coverage, self.job_results_dir, - self.logdir, args.clear_mounts) + self.logdir, args.clear_mounts, cleanup_files) # Convert the test status to a launch.py status status |= summarize_run(logger, self.mode, test_status) diff --git a/src/tests/ftest/pool/destroy.py b/src/tests/ftest/pool/destroy.py index 26280a74978..2bcd5f6cea7 100644 --- a/src/tests/ftest/pool/destroy.py +++ b/src/tests/ftest/pool/destroy.py @@ -362,10 +362,11 @@ def test_destroy_wrong_group(self): server_group_b = self.server_group + "_b" # Prepare and configure dmg config files for a and b. 
- dmg_config_file_a = get_default_config_file(name="control_a") + config_path = os.path.dirname(self.test_env.control_config) + dmg_config_file_a = get_default_config_file(name="control_a", path=config_path) dmg_config_temp_a = self.get_config_file( name=server_group_a, command="dmg", path=self.test_dir) - dmg_config_file_b = get_default_config_file(name="control_b") + dmg_config_file_b = get_default_config_file(name="control_b", path=config_path) dmg_config_temp_b = self.get_config_file( name=server_group_b, command="dmg", path=self.test_dir) @@ -393,7 +394,7 @@ def test_destroy_wrong_group(self): # Get dmg_c instance that uses daos_control_c.yml. Server group is b. cert_dir = os.path.join(os.sep, "etc", "daos", "certs") - dmg_config_file_c = get_default_config_file(name="control_c") + dmg_config_file_c = get_default_config_file(name="control_c", path=config_path) dmg_config_temp_c = self.get_config_file( name=server_group_b, command="dmg", path=self.test_dir) dmg_c = get_dmg_command( diff --git a/src/tests/ftest/recovery/ddb.py b/src/tests/ftest/recovery/ddb.py index 8447db1873a..04df3184984 100644 --- a/src/tests/ftest/recovery/ddb.py +++ b/src/tests/ftest/recovery/ddb.py @@ -10,10 +10,12 @@ from ClusterShell.NodeSet import NodeSet from ddb_utils import DdbCommand from exception_utils import CommandFailure -from general_utils import (DaosTestError, create_string_buffer, distribute_files, - get_clush_command, get_random_string, report_errors, run_command) +from file_utils import distribute_files +from general_utils import (DaosTestError, create_string_buffer, get_random_string, report_errors, + run_command) from pydaos.raw import DaosObjClass, IORequest from recovery_test_base import RecoveryTestBase +from run_utils import get_clush_command def insert_objects(context, container, object_count, dkey_count, akey_count, base_dkey, @@ -507,14 +509,9 @@ def test_recovery_ddb_load(self): file.write(new_data) # Copy the created file to server node. 
- try: - distribute_files( - hosts=host, source=load_file_path, destination=load_file_path, - mkdir=False) - except DaosTestError as error: - raise CommandFailure( - "ERROR: Copying new_data.txt to {0}: {1}".format(host, error)) \ - from error + result = distribute_files(self.log, host, load_file_path, load_file_path, False) + if not result.passed: + raise CommandFailure(f"ERROR: Copying new_data.txt to {result.failed_hosts}") # The file with the new data is ready. Run ddb load. ddb_command.value_load(component_path="[0]/[0]/[0]/[0]", load_file_path=load_file_path) diff --git a/src/tests/ftest/util/agent_utils.py b/src/tests/ftest/util/agent_utils.py index ff815356563..74b79fb9796 100644 --- a/src/tests/ftest/util/agent_utils.py +++ b/src/tests/ftest/util/agent_utils.py @@ -6,7 +6,6 @@ import os import re import socket -from getpass import getuser from agent_utils_params import DaosAgentTransportCredentials, DaosAgentYamlParameters from ClusterShell.NodeSet import NodeSet @@ -289,7 +288,7 @@ def start(self): # Verify the socket directory exists when using a non-systemctl manager if self.verify_socket_dir: - self.verify_socket_directory(getuser()) + self.verify_socket_directory(self.manager.job.certificate_owner) super().start() @@ -319,7 +318,8 @@ def support_collect_log(self, **kwargs): """ cmd = self.manager.job.copy() cmd.debug.value = False - cmd.config.value = get_default_config_file("agent") + cmd.config.value = get_default_config_file( + "agent", os.path.dirname(self.manager.job.yaml.filename)) cmd.set_command(("support", "collect-log"), **kwargs) self.log.info("Support collect-log on clients: %s", str(cmd)) return run_remote(self.log, self.hosts, cmd.with_exports) diff --git a/src/tests/ftest/util/apricot/apricot/test.py b/src/tests/ftest/util/apricot/apricot/test.py index 7a5542c15af..42e05937f37 100644 --- a/src/tests/ftest/util/apricot/apricot/test.py +++ b/src/tests/ftest/util/apricot/apricot/test.py @@ -26,8 +26,7 @@ from exception_utils import 
CommandFailure from fault_config_utils import FaultInjection from general_utils import (dict_to_str, dump_engines_stacks, get_avocado_config_value, - get_default_config_file, get_file_listing, nodeset_append_suffix, - set_avocado_config_value) + nodeset_append_suffix, set_avocado_config_value) from host_utils import HostException, HostInfo, HostRole, get_host_parameters, get_local_host from logger_utils import TestLogger from pydaos.raw import DaosApiError, DaosContext, DaosLog @@ -762,13 +761,17 @@ def setUp(self): self.fail(f"Error creating test-specific temporary directory on {result.failed_hosts}") # Copy the fault injection files to the hosts. - self.fault_injection.copy_fault_files(self.host_info.all_hosts) + self.fault_injection.copy_fault_files(self.log, self.host_info.all_hosts) # List common test directory contents before running the test self.log.info("-" * 100) - self.log.debug("Common test directory (%s) contents:", self.test_dir) + self.log.debug("Common test directory (%s) contents:", os.path.dirname(self.test_dir)) all_hosts = include_local_host(self.host_info.all_hosts) - get_file_listing(all_hosts, self.test_dir, self.test_env.agent_user).log_output(self.log) + test_dir_parent = os.path.dirname(self.test_dir) + result = run_remote(self.log, all_hosts, f"df -h {test_dir_parent}") + if int(max(re.findall(r" ([\d+])% ", result.joined_stdout) + ["0"])) > 90: + run_remote(self.log, all_hosts, f"du -sh {test_dir_parent}/*") + self.log.info("-" * 100) if not self.start_servers_once or self.name.uid == 1: # Kill commands left running on the hosts (from a previous test) @@ -1063,7 +1066,7 @@ def add_agent_manager(self, group=None, config_file=None, config_temp=None): if group is None: group = self.server_group if config_file is None and self.agent_manager_class == "Systemctl": - config_file = get_default_config_file("agent") + config_file = self.test_env.agent_config config_temp = self.get_config_file(group, "agent", self.test_dir) elif config_file is 
None: config_file = self.get_config_file(group, "agent") @@ -1113,14 +1116,14 @@ def add_server_manager(self, group=None, svr_config_file=None, if group is None: group = self.server_group if svr_config_file is None and self.server_manager_class == "Systemctl": - svr_config_file = get_default_config_file("server") + svr_config_file = self.test_env.server_config svr_config_temp = self.get_config_file( group, "server", self.test_dir) elif svr_config_file is None: svr_config_file = self.get_config_file(group, "server") svr_config_temp = None if dmg_config_file is None and self.server_manager_class == "Systemctl": - dmg_config_file = get_default_config_file("control") + dmg_config_file = self.test_env.control_config dmg_config_temp = self.get_config_file(group, "dmg", self.test_dir) elif dmg_config_file is None: dmg_config_file = self.get_config_file(group, "dmg") @@ -1668,7 +1671,7 @@ def get_dmg_command(self, index=0): return self.server_managers[index].dmg if self.server_manager_class == "Systemctl": - dmg_config_file = get_default_config_file("control") + dmg_config_file = self.test_env.control_config dmg_config_temp = self.get_config_file("daos", "dmg", self.test_dir) dmg_cert_dir = os.path.join(os.sep, "etc", "daos", "certs") else: diff --git a/src/tests/ftest/util/collection_utils.py b/src/tests/ftest/util/collection_utils.py index 3b054eb19e9..a900769f4d6 100644 --- a/src/tests/ftest/util/collection_utils.py +++ b/src/tests/ftest/util/collection_utils.py @@ -17,6 +17,7 @@ from util.environment_utils import TestEnvironment from util.host_utils import get_local_host from util.run_utils import find_command, run_local, run_remote, stop_processes +from util.systemctl_utils import stop_service from util.user_utils import get_chown_command from util.yaml_utils import get_test_category @@ -66,97 +67,6 @@ def stop_daos_server_service(logger, test): return stop_service(logger, hosts, service) -def stop_service(logger, hosts, service): - """Stop any daos_server.service 
running on the hosts running servers. - - Args: - logger (Logger): logger for the messages produced by this method - hosts (NodeSet): list of hosts on which to stop the service. - service (str): name of the service - - Returns: - bool: True if the service was successfully stopped; False otherwise - - """ - result = {"status": True} - if hosts: - status_keys = ["reset-failed", "stop", "disable"] - mapping = {"stop": "active", "disable": "enabled", "reset-failed": "failed"} - check_hosts = NodeSet(hosts) - loop = 1 - # Reduce 'max_loops' to 2 once https://jira.hpdd.intel.com/browse/DAOS-7809 - # has been resolved - max_loops = 3 - while check_hosts: - # Check the status of the service on each host - result = get_service_status(logger, check_hosts, service) - check_hosts = NodeSet() - for key in status_keys: - if result[key]: - if loop == max_loops: - # Exit the while loop if the service is still running - logger.error( - " - Error %s still %s on %s", service, mapping[key], result[key]) - result["status"] = False - else: - # Issue the appropriate systemctl command to remedy the - # detected state, e.g. 'stop' for 'active'. - command = ["sudo", "-n", "systemctl", key, service] - run_remote(logger, result[key], " ".join(command)) - - # Run the status check again on this group of hosts - check_hosts.add(result[key]) - loop += 1 - else: - logger.debug(" Skipping stopping %s service - no hosts", service) - - return result["status"] - - -def get_service_status(logger, hosts, service): - """Get the status of the daos_server.service. 
- - Args: - logger (Logger): logger for the messages produced by this method - hosts (NodeSet): hosts on which to get the service state - service (str): name of the service - - Returns: - dict: a dictionary with the following keys: - - "status": boolean set to True if status was obtained; False otherwise - - "stop": NodeSet where to stop the daos_server.service - - "disable": NodeSet where to disable the daos_server.service - - "reset-failed": NodeSet where to reset the daos_server.service - - """ - status = { - "status": True, - "stop": NodeSet(), - "disable": NodeSet(), - "reset-failed": NodeSet()} - status_states = { - "stop": ["active", "activating", "deactivating"], - "disable": ["active", "activating", "deactivating"], - "reset-failed": ["failed"]} - command = ["systemctl", "is-active", service] - result = run_remote(logger, hosts, " ".join(command)) - for data in result.output: - if data.timeout: - status["status"] = False - status["stop"].add(data.hosts) - status["disable"].add(data.hosts) - status["reset-failed"].add(data.hosts) - logger.debug(" %s: TIMEOUT", data.hosts) - break - logger.debug(" %s: %s", data.hosts, "\n".join(data.stdout)) - for key, state_list in status_states.items(): - for line in data.stdout: - if line in state_list: - status[key].add(data.hosts) - break - return status - - def reset_server_storage(logger, test): """Reset the server storage for the hosts that ran servers in the test. 
@@ -981,14 +891,15 @@ def collect_test_result(logger, test, test_result, job_results_dir, stop_daos, a "depth": 1, "timeout": 300, } - remote_files["remote configuration files"] = { - "source": os.path.join(os.sep, "etc", "daos"), - "destination": os.path.join(job_results_dir, "latest", TEST_RESULTS_DIRS[0]), - "pattern": "daos_*.yml", - "hosts": test.host_info.all_hosts, - "depth": 1, - "timeout": 300, - } + for index, source in enumerate(test_env.config_file_directories()): + remote_files[f"remote configuration files ({index})"] = { + "source": source, + "destination": os.path.join(job_results_dir, "latest", TEST_RESULTS_DIRS[0]), + "pattern": "daos_*.yml", + "hosts": test.host_info.all_hosts, + "depth": 1, + "timeout": 300, + } remote_files["daos log files"] = { "source": test_env.log_dir, "destination": os.path.join(job_results_dir, "latest", TEST_RESULTS_DIRS[1]), diff --git a/src/tests/ftest/util/command_utils.py b/src/tests/ftest/util/command_utils.py index b22300974fb..06da3b27868 100644 --- a/src/tests/ftest/util/command_utils.py +++ b/src/tests/ftest/util/command_utils.py @@ -19,9 +19,9 @@ from command_utils_base import (BasicParameter, CommandWithParameters, EnvironmentVariables, FormattedParameter, LogParameter, ObjectWithParameters) from exception_utils import CommandFailure -from general_utils import (DaosTestError, change_file_owner, check_file_exists, create_directory, - distribute_files, get_file_listing, get_job_manager_class, - get_subprocess_stdout, run_command) +from file_utils import change_file_owner, create_directory, distribute_files +from general_utils import (DaosTestError, check_file_exists, get_file_listing, + get_job_manager_class, get_subprocess_stdout, run_command) from run_utils import command_as_user, run_remote from user_utils import get_primary_group from yaml_utils import get_yaml_data @@ -1017,19 +1017,17 @@ def copy_certificates(self, source, hosts): self.log.debug("Copying certificates for %s:", self._command) data = 
yaml.get_certificate_data(yaml.get_attribute_names(LogParameter)) for name in data: - create_directory(hosts, name, verbose=True, raise_exception=False) + create_directory(self.log, hosts, name, verbose=True) for file_name in data[name]: src_file = os.path.join(source, file_name) dst_file = os.path.join(name, file_name) self.log.debug(" %s -> %s", src_file, dst_file) result = distribute_files( - hosts, src_file, dst_file, mkdir=False, - verbose=False, raise_exception=False, sudo=True, - owner=self.certificate_owner) - if result.exit_status != 0: + self.log, hosts, src_file, dst_file, mkdir=False, + verbose=False, sudo=True, owner=self.certificate_owner) + if not result.passed: self.log.info( - " WARNING: %s copy failed on %s:\n%s", - dst_file, hosts, result) + " WARNING: %s copy failed on %s", dst_file, result.failed_hosts) names.add(name) yaml = yaml.other_params @@ -1051,21 +1049,18 @@ def copy_configuration(self, hosts): Raises: CommandFailure: if there is an error copying the configuration file - """ if self.yaml is not None and hasattr(self.yaml, "filename"): if self.temporary_file and hosts: self.log.info( "Copying %s yaml configuration file to %s on %s", self.temporary_file, self.yaml.filename, hosts) - try: - distribute_files( - hosts, self.temporary_file, self.yaml.filename, - verbose=False, sudo=True) - except DaosTestError as error: + result = distribute_files( + self.log, hosts, self.temporary_file, self.yaml.filename, verbose=False, + sudo=True) + if not result.passed: raise CommandFailure( - "ERROR: Copying yaml configuration file to {}: " - "{}".format(hosts, error)) from error + f"ERROR: Copying yaml configuration file to {result.failed_hosts}") def verify_socket_directory(self, user, hosts): """Verify the domain socket directory is present and owned by this user. 
@@ -1088,15 +1083,17 @@ def verify_socket_directory(self, user, hosts): self.log.info( "%s: creating socket directory %s for user %s on %s", self.command, directory, user, nodes) - try: - create_directory(nodes, directory, sudo=True) - change_file_owner(nodes, directory, user, get_primary_group(user), sudo=True) - except DaosTestError as error: + result = create_directory(self.log, nodes, directory, user="root") + if not result.passed: + raise CommandFailure( + f"{self.command}: error creating socket directory {directory} for user " + f"{user} on {result.failed_hosts}") + result = change_file_owner( + self.log, nodes, directory, user, get_primary_group(user), user="root") + if not result.passed: raise CommandFailure( - "{}: error setting up missing socket directory {} for " - "user {} on {}:\n{}".format( - self.command, directory, user, nodes, - error)) from error + f"{self.command}: error setting socket directory {directory} owner for " + f"user {user} on {result.failed_hosts}") def get_socket_dir(self): """Get the socket directory. 
diff --git a/src/tests/ftest/util/environment_utils.py b/src/tests/ftest/util/environment_utils.py index 1e295be3815..e36d750500e 100644 --- a/src/tests/ftest/util/environment_utils.py +++ b/src/tests/ftest/util/environment_utils.py @@ -9,6 +9,7 @@ from ClusterShell.NodeSet import NodeSet # pylint: disable=import-error,no-name-in-module +from util.host_utils import get_local_host from util.network_utils import (PROVIDER_ALIAS, SUPPORTED_PROVIDERS, NetworkException, get_common_provider, get_fastest_interface) from util.run_utils import run_remote @@ -109,6 +110,9 @@ class TestEnvironment(): 'daos_prefix': 'DAOS_TEST_PREFIX', 'agent_user': 'DAOS_TEST_AGENT_USER', 'systemd_library_path': 'DAOS_TEST_SYSTEMD_LIBRARY_PATH', + 'control_config': 'DAOS_TEST_CONTROL_CONFIG', + 'agent_config': 'DAOS_TEST_AGENT_CONFIG', + 'server_config': 'DAOS_TEST_SERVER_CONFIG', } def __init__(self): @@ -155,29 +159,34 @@ def set_defaults(self, logger, servers=None, clients=None, provider=None, insecu # Set defaults for any unset values if self.log_dir is None: - self.log_dir = self._default_log_dir() + self.log_dir = os.path.join(os.sep, "var", "tmp", "daos_testing") if self.shared_dir is None: - self.shared_dir = self._default_shared_dir() + self.shared_dir = os.path.expanduser(os.path.join("~", "daos_test")) if self.app_dir is None: - self.app_dir = self._default_app_dir() + self.app_dir = os.path.join(self.shared_dir, "daos_test", "apps") if self.user_dir is None: - self.user_dir = self._default_user_dir() + self.user_dir = os.path.join(self.log_dir, "user") if self.interface is None: self.interface = self._default_interface(logger, all_hosts) if self.provider is None: self.provider = self._default_provider(logger, servers) if self.insecure_mode is None: - self.insecure_mode = self._default_insecure_mode() + self.insecure_mode = "True" if self.bullseye_src is None: - self.bullseye_src = self._default_bullseye_src() + self.bullseye_src = os.path.join( + 
os.path.dirname(os.path.abspath(__file__)), "..", "test.cov") if self.bullseye_file is None: - self.bullseye_file = self._default_bullseye_file() + self.bullseye_file = os.path.join(os.sep, "tmp", "test.cov") if self.daos_prefix is None: self.daos_prefix = self._default_daos_prefix(logger) if self.agent_user is None: - self.agent_user = self._default_agent_user() - if self.systemd_library_path is None: - self.systemd_library_path = self._default_systemd_library_path() + self.agent_user = 'root' + if self.control_config is None: + self.control_config = os.path.join(self.log_dir, "configs", "daos_control.yml") + if self.agent_config is None: + self.agent_config = os.path.join(self.log_dir, "configs", "daos_agent.yml") + if self.server_config is None: + self.server_config = os.path.join(self.log_dir, "configs", "daos_server.yml") def __set_value(self, key, value): """Set the test environment variable. @@ -209,14 +218,6 @@ def app_dir(self, value): """ self.__set_value('app_dir', value) - def _default_app_dir(self): - """Get the default application directory path. - - Returns: - str: the default application directory path - """ - return os.path.join(self.shared_dir, "daos_test", "apps") - @property def app_src(self): """Get the location from which to copy test applications. @@ -253,15 +254,6 @@ def log_dir(self, value): """ self.__set_value('log_dir', value) - @staticmethod - def _default_log_dir(): - """Get the default local log directory path. - - Returns: - str: the default local log directory path - """ - return os.path.join(os.sep, "var", "tmp", "daos_testing") - @property def shared_dir(self): """Get the shared log directory path. @@ -280,15 +272,6 @@ def shared_dir(self, value): """ self.__set_value('shared_dir', value) - @staticmethod - def _default_shared_dir(): - """Get the default shared log directory path. 
- - Returns: - str: the default shared log directory path - """ - return os.path.expanduser(os.path.join("~", "daos_test")) - @property def user_dir(self): """Get the user directory path. @@ -307,14 +290,6 @@ def user_dir(self, value): """ self.__set_value('user_dir', value) - def _default_user_dir(self): - """Get the default user directory path. - - Returns: - str: the default user directory path - """ - return os.path.join(self.log_dir, "user") - @property def interface(self): """Get the interface device. @@ -352,7 +327,7 @@ def _default_interface(self, logger, hosts): # Find all the /sys/class/net interfaces on the launch node (excluding lo) logger.debug("Detecting network devices - D_INTERFACE not set") try: - interface = get_fastest_interface(logger, hosts) + interface = get_fastest_interface(logger, hosts | get_local_host()) except NetworkException as error: raise TestEnvironmentException("Error obtaining a default interface!") from error return interface @@ -447,15 +422,6 @@ def insecure_mode(self, value): """ self.__set_value('insecure_mode', value) - @staticmethod - def _default_insecure_mode(): - """Get the default insecure mode. - - Returns: - str: the default insecure mode - """ - return "True" - @property def bullseye_src(self): """Get the bullseye source file. @@ -474,15 +440,6 @@ def bullseye_src(self, value): """ self.__set_value('bullseye_src', value) - @staticmethod - def _default_bullseye_src(): - """Get the default bullseye source file. - - Returns: - str: the default bullseye source file - """ - return os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "test.cov") - @property def bullseye_file(self): """Get the bullseye file. @@ -501,15 +458,6 @@ def bullseye_file(self, value): """ self.__set_value('bullseye_file', value) - @staticmethod - def _default_bullseye_file(): - """Get the default bullseye file. 
- - Returns: - str: the default bullseye file - """ - return os.path.join(os.sep, "tmp", "test.cov") - @property def daos_prefix(self): """Get the daos_prefix. @@ -572,15 +520,6 @@ def agent_user(self, value): """ self.__set_value('agent_user', value) - @staticmethod - def _default_agent_user(): - """Get the default daos_agent user. - - Returns: - str: the default daos_agent user - """ - return 'root' - @property def systemd_library_path(self): """Get the systemd LD_LIBRARY_PATH. @@ -599,14 +538,71 @@ def systemd_library_path(self, value): """ self.__set_value('systemd_library_path', value) - @staticmethod - def _default_systemd_library_path(): - """Get the default systemd LD_LIBRARY_PATH. + @property + def control_config(self): + """Get the control config file used in testing. + + Returns: + str: the control config file + """ + return os.environ.get(self.__ENV_VAR_MAP['control_config']) + + @control_config.setter + def control_config(self, value): + """Set the control config file used in testing. + + Args: + value (str): the control config file + """ + self.__set_value('control_config', value) + + @property + def agent_config(self): + """Get the agent config file used in testing. + + Returns: + str: the agent config file + """ + return os.environ.get(self.__ENV_VAR_MAP['agent_config']) + + @agent_config.setter + def agent_config(self, value): + """Set the agent config file used in testing. + + Args: + value (str): the agent config file + """ + self.__set_value('agent_config', value) + + @property + def server_config(self): + """Get the server config file used in testing. + + Returns: + str: the server config file + """ + return os.environ.get(self.__ENV_VAR_MAP['server_config']) + + @server_config.setter + def server_config(self, value): + """Set the server config file used in testing. 
+ + Args: + value (str): the server config file + """ + self.__set_value('server_config', value) + + def config_file_directories(self): + """Get the unique list of directories for the client, control, and server config files. Returns: - str: the default systemd LD_LIBRARY_PATH + list: a list of directories for the client, control, and server config files """ - return None + directories = set() + directories.add(os.path.dirname(self.agent_config)) + directories.add(os.path.dirname(self.control_config)) + directories.add(os.path.dirname(self.server_config)) + return list(directories) def set_test_environment(logger, test_env=None, servers=None, clients=None, provider=None, diff --git a/src/tests/ftest/util/fault_config_utils.py b/src/tests/ftest/util/fault_config_utils.py index b6d437c2e29..5dd3071271f 100644 --- a/src/tests/ftest/util/fault_config_utils.py +++ b/src/tests/ftest/util/fault_config_utils.py @@ -8,7 +8,7 @@ import os import yaml -from general_utils import distribute_files +from file_utils import distribute_files from run_utils import run_local, run_remote # a lookup table of predefined faults @@ -295,15 +295,22 @@ def start(self, fault_list, test_dir): # orterun or something, could re-evaluate this later self.write_fault_file(None) - def copy_fault_files(self, hosts): + def copy_fault_files(self, logger, hosts): """Copy the fault injection file to all test hosts. 
Args: + logger (Logger): logger for the messages produced by this method hosts (list): list of hosts to copy the fault injection file + + Raises: + FaultInjectionFailed: if there is an error copying the fault injection files """ if self._fault_list: self._hosts = hosts - distribute_files(self._hosts, self.fault_file, self.fault_file) + result = distribute_files(logger, self._hosts, self.fault_file, self.fault_file) + if not result.passed: + raise FaultInjectionFailed( + f"Error copying fault injection files to {result.failed_hosts}") def stop(self): """Remove the fault injection file created during testing. diff --git a/src/tests/ftest/util/file_utils.py b/src/tests/ftest/util/file_utils.py new file mode 100644 index 00000000000..13f7e45a382 --- /dev/null +++ b/src/tests/ftest/util/file_utils.py @@ -0,0 +1,147 @@ +""" + (C) Copyright 2018-2024 Intel Corporation. + + SPDX-License-Identifier: BSD-2-Clause-Patent +""" + +import os + +# pylint: disable=import-error,no-name-in-module +from util.host_utils import get_local_host +from util.run_utils import command_as_user, get_clush_command, run_local, run_remote +from util.user_utils import get_chown_command, get_primary_group + + +def __run_command(logger, hosts, command, verbose=True, timeout=15): + """Run the command locally if there are no remote hosts or remotely. + + Args: + logger (Logger): logger for the messages produced by this method + hosts (NodeSet): hosts on which to run the command + command (str): command to run + verbose (bool, optional): log the command output. Defaults to True. + timeout (int, optional): command timeout. Defaults to 15 seconds. 
+ + Returns: + CommandResult: groups of command results from the same hosts with the same return status + """ + if not hosts.difference(get_local_host()): + return run_local(logger, command, verbose, timeout) + return run_remote(logger, hosts, command, verbose, timeout) + + +def create_directory(logger, hosts, directory, timeout=15, verbose=True, user=None): + """Create the specified directory on the specified hosts. + + Args: + logger (Logger): logger for the messages produced by this method + hosts (NodeSet): hosts on which to create the directory + directory (str): the directory to create + timeout (int, optional): command timeout. Defaults to 15 seconds. + verbose (bool, optional): log the command output. Defaults to True. + user (str, optional): user with which to run the command. Defaults to None. + + Returns: + CommandResult: groups of command results from the same hosts with the same return status + """ + command = command_as_user(f"mkdir -p {directory}", user) + return __run_command(logger, hosts, command, verbose, timeout) + + +def change_file_owner(logger, hosts, filename, owner, group, timeout=15, verbose=True, user=None): + """Create the specified directory on the specified hosts. + + Args: + logger (Logger): logger for the messages produced by this method + hosts (NodeSet): hosts on which to create the directory + filename (str): the file for which to change ownership + owner (str): new owner of the file + group (str): new group owner of the file + timeout (int, optional): command timeout. Defaults to 15 seconds. + verbose (bool, optional): log the command output. Defaults to True. + user (str, optional): user with which to run the command. Defaults to None. 
+
+    Returns:
+        CommandResult: groups of command results from the same hosts with the same return status
+    """
+    command = command_as_user(get_chown_command(owner, group, file=filename), user)
+    return __run_command(logger, hosts, command, verbose, timeout)
+
+
+def get_file_size(logger, host, file_name):
+    """Obtain the file size on the specified host.
+
+    Args:
+        logger (Logger): logger for the messages produced by this method
+        host (NodeSet): host from which to get the file size
+        file_name (str): name of remote file
+
+    Returns:
+        CommandResult: groups of command results from the same hosts with the same return status
+    """
+    return __run_command(logger, host, f"stat -c%s {file_name}")
+
+
+def distribute_files(logger, hosts, source, destination, mkdir=True, timeout=60,
+                     verbose=True, sudo=False, owner=None):
+    """Copy the source to the destination on each of the specified hosts.
+
+    Optionally (by default) ensure the destination directory exists on each of
+    the specified hosts prior to copying the source.
+
+    Args:
+        logger (Logger): logger for the messages produced by this method
+        hosts (NodeSet): hosts on which to copy the source
+        source (str): the file to copy to the hosts
+        destination (str): the host location in which to copy the source
+        mkdir (bool, optional): whether or not to ensure the destination
+            directory exists on hosts prior to copying the source. Defaults to
+            True.
+        timeout (int, optional): command timeout. Defaults to 60 seconds.
+        verbose (bool, optional): whether to log the command run and
+            stdout/stderr. Defaults to True.
+        sudo (bool, optional): whether to copy the file as root; the source is
+            first copied to each remote host as the current user and then
+            copied locally into the destination there via sudo. Defaults to
+            False.
+        owner (str, optional): if specified the owner to assign as the owner of
+            the copied file. Defaults to None.
+ + Returns: + CommandResult: groups of command results from the same hosts with the same return status + """ + result = None + if mkdir: + result = create_directory(logger, hosts, os.path.dirname(destination), timeout, verbose) + + if result is None or result.passed: + if sudo: + # In order to copy a protected file to a remote host in CI the source will first be + # copied as is to the remote host + other_hosts = hosts.difference(get_local_host()) + if other_hosts: + # Existing files with strict file permissions can cause the subsequent non-sudo + # copy to fail, so remove the file first + _rm_command = command_as_user(f"rm -f {source}", "root") + run_remote(logger, other_hosts, _rm_command, verbose, timeout) + result = distribute_files( + logger, other_hosts, source, source, mkdir=True, + timeout=timeout, verbose=verbose, sudo=False, owner=None) + + if result is None or result.passed: + # Then a local sudo copy will be executed on the remote node to copy the source + # to the destination + _cp_command = command_as_user(f"cp {source} {destination}", "root") + result = run_remote(logger, hosts, _cp_command, verbose, timeout) + else: + # Without the sudo requirement copy the source to the destination directly with clush + _command = get_clush_command(hosts, args=f"-S -v --copy {source} --dest {destination}") + result = run_local(logger, _command, verbose, timeout) + + # If requested update the ownership of the destination file + if owner is not None and result.passed: + result = change_file_owner( + logger, hosts, destination, owner, get_primary_group(owner), timeout, verbose, + "root" if sudo else None) + + return result diff --git a/src/tests/ftest/util/general_utils.py b/src/tests/ftest/util/general_utils.py index 7ef3b73572d..84e55601ff2 100644 --- a/src/tests/ftest/util/general_utils.py +++ b/src/tests/ftest/util/general_utils.py @@ -16,15 +16,13 @@ from getpass import getuser from importlib import import_module from logging import getLogger -from socket import 
gethostname from avocado.core.settings import settings from avocado.core.version import MAJOR from avocado.utils import process from ClusterShell.NodeSet import NodeSet from ClusterShell.Task import task_self -from run_utils import command_as_user, get_clush_command, run_local, run_remote -from user_utils import get_chown_command, get_primary_group +from run_utils import command_as_user, run_local, run_remote class DaosTestError(Exception): @@ -891,177 +889,20 @@ def convert_string(item, separator=","): return item -def create_directory(hosts, directory, timeout=15, verbose=True, - raise_exception=True, sudo=False): - """Create the specified directory on the specified hosts. - - Args: - hosts (NodeSet): hosts on which to create the directory - directory (str): the directory to create - timeout (int, optional): command timeout. Defaults to 15 seconds. - verbose (bool, optional): whether to log the command run and - stdout/stderr. Defaults to True. - raise_exception (bool, optional): whether to raise an exception if the - command returns a non-zero exit status. Defaults to True. - sudo (bool, optional): whether to run the command via sudo. Defaults to - False. - - Raises: - DaosTestError: if there is an error running the command - - Returns: - CmdResult: an avocado.utils.process CmdResult object containing the - result of the command execution. 
A CmdResult object has the - following properties: - command - command string - exit_status - exit_status of the command - stdout - the stdout - stderr - the stderr - duration - command execution time - interrupted - whether the command completed within timeout - pid - command's pid - - """ - mkdir_command = "/usr/bin/mkdir -p {}".format(directory) - command = get_clush_command(hosts, args="-S -v", command=mkdir_command, command_sudo=sudo) - return run_command(command, timeout=timeout, verbose=verbose, raise_exception=raise_exception) - - -def change_file_owner(hosts, filename, owner, group, timeout=15, verbose=True, - raise_exception=True, sudo=False): - """Create the specified directory on the specified hosts. - - Args: - hosts (NodeSet): hosts on which to create the directory - filename (str): the file for which to change ownership - owner (str): new owner of the file - group (str): new group owner of the file - timeout (int, optional): command timeout. Defaults to 15 seconds. - verbose (bool, optional): whether to log the command run and - stdout/stderr. Defaults to True. - raise_exception (bool, optional): whether to raise an exception if the - command returns a non-zero exit status. Defaults to True. - sudo (bool, optional): whether to run the command via sudo. Defaults to - False. - - Raises: - DaosTestError: if there is an error running the command - - Returns: - CmdResult: an avocado.utils.process CmdResult object containing the - result of the command execution. 
A CmdResult object has the - following properties: - command - command string - exit_status - exit_status of the command - stdout - the stdout - stderr - the stderr - duration - command execution time - interrupted - whether the command completed within timeout - pid - command's pid - - """ - chown_command = get_chown_command(owner, group, file=filename) - command = get_clush_command(hosts, args="-S -v", command=chown_command, command_sudo=sudo) - return run_command(command, timeout=timeout, verbose=verbose, raise_exception=raise_exception) - - -def distribute_files(hosts, source, destination, mkdir=True, timeout=60, - verbose=True, raise_exception=True, sudo=False, - owner=None): - """Copy the source to the destination on each of the specified hosts. - - Optionally (by default) ensure the destination directory exists on each of - the specified hosts prior to copying the source. - - Args: - hosts (NodeSet): hosts on which to copy the source - source (str): the file to copy to the hosts - destination (str): the host location in which to copy the source - mkdir (bool, optional): whether or not to ensure the destination - directory exists on hosts prior to copying the source. Defaults to - True. - timeout (int, optional): command timeout. Defaults to 60 seconds. - verbose (bool, optional): whether to log the command run and - stdout/stderr. Defaults to True. - raise_exception (bool, optional): whether to raise an exception if the - command returns a non-zero exit status. Defaults to True. - sudo (bool, optional): whether to run the command via sudo. Defaults to - False. - owner (str, optional): if specified the owner to assign as the owner of - the copied file. Defaults to None. - - Raises: - DaosTestError: if there is an error running the command - - Returns: - CmdResult: an avocado.utils.process CmdResult object containing the - result of the command execution. 
A CmdResult object has the - following properties: - command - command string - exit_status - exit_status of the command - stdout - the stdout - stderr - the stderr - duration - command execution time - interrupted - whether the command completed within timeout - pid - command's pid - - """ - result = None - if mkdir: - result = create_directory( - hosts, os.path.dirname(destination), verbose=verbose, - raise_exception=raise_exception) - if result is None or result.exit_status == 0: - if sudo: - # In order to copy a protected file to a remote host in CI the - # source will first be copied as is to the remote host - localhost = gethostname().split(".")[0] - other_hosts = NodeSet.fromlist([host for host in hosts if host != localhost]) - if other_hosts: - # Existing files with strict file permissions can cause the - # subsequent non-sudo copy to fail, so remove the file first - rm_command = get_clush_command( - other_hosts, args="-S -v", command="rm -f {}".format(source), - command_sudo=True) - run_command(rm_command, verbose=verbose, raise_exception=False) - result = distribute_files( - other_hosts, source, source, mkdir=True, - timeout=timeout, verbose=verbose, - raise_exception=raise_exception, sudo=False, owner=None) - if result is None or result.exit_status == 0: - # Then a local sudo copy will be executed on the remote node to - # copy the source to the destination - command = get_clush_command( - hosts, args="-S -v", command="cp {} {}".format(source, destination), - command_sudo=True) - result = run_command(command, timeout, verbose, raise_exception) - else: - # Without the sudo requirement copy the source to the destination - # directly with clush - command = get_clush_command( - hosts, args="-S -v --copy {} --dest {}".format(source, destination)) - result = run_command(command, timeout, verbose, raise_exception) - - # If requested update the ownership of the destination file - if owner is not None and result.exit_status == 0: - change_file_owner( - hosts, 
destination, owner, get_primary_group(owner), timeout=timeout, - verbose=verbose, raise_exception=raise_exception, sudo=sudo) - return result - - -def get_default_config_file(name): +def get_default_config_file(name, path=None): """Get the default config file. Args: name (str): daos component name, e.g. server, agent, control + path (str, optional): path to use for the config file. Defaults to None which will use the + /etc/daos default. Returns: str: the default config file - """ - file_name = "".join(["daos_", name, ".yml"]) - return os.path.join(os.sep, "etc", "daos", file_name) + if path is None: + path = os.path.join(os.sep, "etc", "daos") + return os.path.join(path, f"daos_{name}.yml") def get_file_listing(hosts, files, user): diff --git a/src/tests/ftest/util/launch_utils.py b/src/tests/ftest/util/launch_utils.py index a86df01ea55..0f7284c50ef 100644 --- a/src/tests/ftest/util/launch_utils.py +++ b/src/tests/ftest/util/launch_utils.py @@ -20,9 +20,10 @@ from util.host_utils import HostException, HostInfo, get_local_host, get_node_set from util.logger_utils import LOG_FILE_FORMAT, get_file_handler from util.results_utils import LaunchTestName -from util.run_utils import RunException, run_local, run_remote +from util.run_utils import RunException, command_as_user, run_local, run_remote from util.slurm_utils import create_partition, delete_partition, show_partition from util.storage_utils import StorageException, StorageInfo +from util.systemctl_utils import SystemctlFailure, create_override_config from util.user_utils import get_group_id, get_user_groups, groupadd, useradd, userdel from util.yaml_utils import YamlUpdater, get_yaml_data @@ -77,6 +78,66 @@ def setup_fuse_config(logger, hosts): raise LaunchException(f"Failed to setup {config}") +def __add_systemctl_override(logger, hosts, service, user, command, config, path, lib_path): + """Add a systemctl override file for the specified service. 
+
+    Args:
+        logger (Logger): logger for the messages produced by this method
+        hosts (NodeSet): hosts on which to create the systemctl config
+        service (str): service for which to issue the command
+        user (str): user to use to issue the command
+        command (str): full path to the service command
+        config (str): full path to the service config
+        path (str): the PATH variable to set in the systemd config.
+        lib_path (str): the LD_LIBRARY_PATH variable to set in the systemd config.
+
+    Raises:
+        LaunchException: if setup fails
+
+    Returns:
+        dict: a dictionary of the systemctl override config file key with a dictionary value
+            containing the related host and user information
+    """
+    logger.debug("-" * 80)
+    logger.info("Setting up systemctl override for %s", service)
+    try:
+        systemctl_override = create_override_config(
+            logger, hosts, service, user, command, config, path, lib_path)
+    except SystemctlFailure as error:
+        raise LaunchException(f"Failed to setup systemctl config for {service}") from error
+    return {systemctl_override: {"hosts": hosts, "user": user}}
+
+
+def setup_systemctl(logger, servers, clients, test_env):
+    """Set up the systemctl override files for the daos_server and daos_agent.
+ + Args: + logger (Logger): logger for the messages produced by this method + servers (NodeSet): hosts that may run the daos_server command + clients (NodeSet): hosts that may run the daos_agent command + test_env (TestEnvironment): the test environment + + Raises: + LaunchException: if setup fails + + Returns: + dict: a dictionary of systemctl override config file keys with NodeSet values identifying + the hosts on which to remove the config files at the end of testing + """ + systemctl_configs = {} + systemctl_configs.update( + __add_systemctl_override( + logger, servers, "daos_server.service", "root", + os.path.join(test_env.daos_prefix, "bin", "daos_server"), test_env.server_config, + None, None)) + systemctl_configs.update( + __add_systemctl_override( + logger, clients, "daos_agent.service", test_env.agent_user, + os.path.join(test_env.daos_prefix, "bin", "daos_agent"), test_env.agent_config, + None, None)) + return systemctl_configs + + def display_disk_space(logger, path): """Display disk space of provided path destination. 
@@ -544,12 +605,13 @@ def _setup_test_directory(self, logger, test): f"sudo -n rm -fr {test_env.log_dir}", f"mkdir -p {test_env.log_dir}", f"chmod a+wrx {test_env.log_dir}", - f"ls -al {test_env.log_dir}", - f"mkdir -p {test_env.user_dir}" ] # Predefine the sub directories used to collect the files process()/_archive_files() + directories = [test_env.user_dir] + test_env.config_file_directories() for directory in TEST_RESULTS_DIRS: - commands.append(f"mkdir -p {test_env.log_dir}/{directory}") + directories.append(os.path.join(test_env.log_dir, directory)) + commands.append(f"mkdir -p {' '.join(directories)}") + commands.append(f"ls -al {test_env.log_dir}") for command in commands: if not run_remote(logger, hosts, command).passed: message = "Error setting up the common test directory on all hosts" @@ -1161,7 +1223,7 @@ def _setup_application_directory(self, logger, result): def run_tests(self, logger, result, repeat, slurm_setup, sparse, fail_fast, stop_daos, archive, rename, jenkins_log, core_files, threshold, user_create, code_coverage, - job_results_dir, logdir, clear_mounts): + job_results_dir, logdir, clear_mounts, cleanup_files): # pylint: disable=too-many-arguments """Run all the tests. 
@@ -1183,6 +1245,7 @@ def run_tests(self, logger, result, repeat, slurm_setup, sparse, fail_fast, stop job_results_dir (str): avocado job-results directory logdir (str): base directory in which to place the log file clear_mounts (list): mount points to remove before each test + cleanup_files (dict): files to remove on specific hosts at the end of testing Returns: int: status code indicating any issues running tests @@ -1233,6 +1296,12 @@ def run_tests(self, logger, result, repeat, slurm_setup, sparse, fail_fast, stop # Stop logging to the test log file logger.removeHandler(test_file_handler) + # Cleanup any specified files at the end of testing + for file, info in cleanup_files.items(): + command = command_as_user(f"rm -fr {file}", info['user']) + if not run_remote(logger, info['hosts'], command).passed: + return_code |= 16 + # Collect code coverage files after all test have completed if not code_coverage.finalize(logger, job_results_dir, result.tests[0]): return_code |= 16 diff --git a/src/tests/ftest/util/server_utils.py b/src/tests/ftest/util/server_utils.py index d7e1efdd58a..6ae05af94e9 100644 --- a/src/tests/ftest/util/server_utils.py +++ b/src/tests/ftest/util/server_utils.py @@ -457,7 +457,8 @@ def support_collect_log(self, **kwargs): cmd = DaosServerCommand(self.manager.job.command_path) cmd.run_user = "daos_server" cmd.debug.value = False - kwargs['config'] = get_default_config_file("server") + kwargs['config'] = get_default_config_file( + "server", os.path.dirname(self.manager.job.yaml.filename)) cmd.set_command(("support", "collect-log"), **kwargs) self.log.info("Support collect-log on servers: %s", str(cmd)) return run_remote( diff --git a/src/tests/ftest/util/systemctl_utils.py b/src/tests/ftest/util/systemctl_utils.py new file mode 100644 index 00000000000..848b7dc6d31 --- /dev/null +++ b/src/tests/ftest/util/systemctl_utils.py @@ -0,0 +1,249 @@ +""" + (C) Copyright 2018-2024 Intel Corporation. 
+ + SPDX-License-Identifier: BSD-2-Clause-Patent +""" + +import getpass +import os +import tempfile + +from ClusterShell.NodeSet import NodeSet +# pylint: disable=import-error,no-name-in-module +from util.file_utils import create_directory, distribute_files +from util.run_utils import command_as_user, run_remote + + +class SystemctlFailure(Exception): + """Base exception for this module.""" + + +def get_service_status(logger, hosts, service, user="root"): + """Get the status of the daos_server.service. + + Args: + logger (Logger): logger for the messages produced by this method + hosts (NodeSet): hosts on which to get the service state + service (str): name of the service + user (str, optional): user to use to issue the command. Defaults to "root". + + Returns: + dict: a dictionary with the following keys: + - "status": boolean set to True if status was obtained; False otherwise + - "stop": NodeSet where to stop the daos_server.service + - "disable": NodeSet where to disable the daos_server.service + - "reset-failed": NodeSet where to reset the daos_server.service + + """ + status = { + "status": True, + "stop": NodeSet(), + "disable": NodeSet(), + "reset-failed": NodeSet()} + status_states = { + "stop": ["active", "activating", "deactivating"], + "disable": ["active", "activating", "deactivating"], + "reset-failed": ["failed"]} + command = get_systemctl_command("is-active", service, user) + result = run_remote(logger, hosts, command, False) + for data in result.output: + if data.timeout: + status["status"] = False + status["stop"].add(data.hosts) + status["disable"].add(data.hosts) + status["reset-failed"].add(data.hosts) + logger.debug(" %s: TIMEOUT", data.hosts) + break + logger.debug(" %s: %s", data.hosts, "\n".join(data.stdout)) + for key, state_list in status_states.items(): + for line in data.stdout: + if line in state_list: + status[key].add(data.hosts) + break + return status + + +def stop_service(logger, hosts, service, user="root"): + """Stop any 
daos_server.service running on the hosts running servers. + + Args: + logger (Logger): logger for the messages produced by this method + hosts (NodeSet): list of hosts on which to stop the service. + service (str): name of the service + user (str, optional): user to use to issue the command. Defaults to "root". + + Returns: + bool: True if the service was successfully stopped; False otherwise + + """ + if not hosts: + logger.debug(" Skipping stopping %s service - no hosts", service) + return True + + result = {"status": True} + status_keys = ["reset-failed", "stop", "disable"] + mapping = {"stop": "active", "disable": "enabled", "reset-failed": "failed"} + check_hosts = NodeSet(hosts) + loop = 1 + # Reduce 'max_loops' to 2 once https://jira.hpdd.intel.com/browse/DAOS-7809 has been resolved + max_loops = 3 + while check_hosts: + # Check the status of the service on each host + result = get_service_status(logger, check_hosts, service) + check_hosts = NodeSet() + for key in status_keys: + if result[key]: + if loop == max_loops: + # Exit the while loop if the service is still running + logger.error( + " - Error %s still %s on %s", service, mapping[key], result[key]) + result["status"] = False + else: + # Issue the appropriate systemctl command to remedy the + # detected state, e.g. 'stop' for 'active'. + command = command_as_user(get_systemctl_command(key, service, user), user) + run_remote(logger, result[key], command) + + # Run the status check again on this group of hosts + check_hosts.add(result[key]) + loop += 1 + + return result["status"] + + +def get_systemctl_command(unit_command, service, user="root"): + """Get the systemctl command for the specified inputs. + + Args: + unit_command (str): command to issue for the service + service (str): service for which to issue the command + user (str, optional): user to use to issue the command. Defaults to "root". 
+ + Returns: + str: the systemctl command for the specified service and user + """ + command = ["systemctl"] + if user != "root": + command.append(f"--user {user}") + if unit_command: + command.append(unit_command) + if service: + command.append(service) + return " ".join(command) + + +def get_service_file(logger, hosts, service, user, verbose=True, timeout=120): + """Get the service file. + + Args: + logger (Logger): logger for the messages produced by this method + hosts (NodeSet): hosts on which to run the command + service (str): service for which to issue the command + user (str, optional): user to use to issue the command. Defaults to "root". + verbose (bool, optional): log the command output. Defaults to True. + timeout (int, optional): number of seconds to wait for the command to complete. + Defaults to 120 seconds. + + Raises: + SystemctlFailure: if there is a problem obtaining the service file + + Returns: + str: the service file + """ + command = ' | '.join([ + get_systemctl_command("status", service, user), + "grep 'Loaded:'", + "grep -oE '/.*service'", + "xargs sh -c '[ -e \"$0\" ] && echo \"$0\"'" + ]) + result = run_remote(logger, hosts, command, verbose, timeout) + if not result.passed: + raise SystemctlFailure("Error obtaining the service file path") + if not result.homogeneous: + raise SystemctlFailure("Error obtaining a homogeneous service file path") + return list(result.all_stdout.values())[0].strip() + + +def create_override_config(logger, hosts, service, user, service_command, service_config, path, + ld_library_path, verbose=True, timeout=120): + """Create a systemctl override config file. 
+ + Args: + logger (Logger): logger for the messages produced by this method + hosts (NodeSet): hosts on which to run the command + service (str): service for which to issue the command + user (str): user to use to issue the command + service_command (str): full path to the service command + service_config (str): full path to the service config + path (str): the PATH variable to set in the systemd config. + ld_library_path (str): the LD_LIBRARY_PATH variable to set in the systemd config. + verbose (bool, optional): log the command output. Defaults to True. + timeout (int, optional): number of seconds to wait for the command to complete. + Defaults to 120 seconds. + + Raises: + SystemctlFailure: if there are problems detecting, creating, or distributing the systemctl + override config file + + Returns: + str: the systemctl override config file path + """ + # Get the existing service file + service_file = get_service_file(logger, hosts, service, user, verbose, timeout) + + # Create the override directory + override_file = os.path.join(f"{service_file}.d", "override.conf") + result = create_directory(logger, hosts, os.path.dirname(override_file), timeout, verbose, user) + if not result.passed: + raise SystemctlFailure("Error creating the systemctl override config directory") + + # Create the override file - empty ExecStart clears the existing setting + override_contents = [ + "[Service]", + "ExecStart=", + f"ExecStart={service_command} start -o {service_config}" + ] + if path: + override_contents.append(f'Environment="PATH={path}"') + if ld_library_path: + override_contents.append(f'Environment="LD_LIBRARY_PATH={ld_library_path}"') + override_contents = "\n".join(override_contents) + "\n" + + with tempfile.NamedTemporaryFile() as temp: + temp.write(bytes(override_contents, encoding='utf-8')) + temp.flush() + os.chmod(temp.name, 0o644) + + _sudo = user != getpass.getuser() + _owner = user if _sudo else None + + result = distribute_files( + logger, hosts, temp.name, 
override_file, mkdir=False, verbose=verbose, sudo=_sudo,
+            owner=_owner)
+        if not result.passed:
+            raise SystemctlFailure(
+                "Error distributing the systemctl override config directory")
+
+    # Reload on all hosts to pick up changes
+    if not daemon_reload(logger, hosts, user, verbose, timeout).passed:
+        raise SystemctlFailure("Error reloading systemctl daemon with override config directory")
+
+    return override_file
+
+
+def daemon_reload(logger, hosts, user, verbose=True, timeout=120):
+    """Run systemctl daemon-reload.
+
+    Args:
+        logger (Logger): logger for the messages produced by this method
+        hosts (NodeSet): hosts on which to run the command
+        user (str): user to use to issue the command
+        verbose (bool, optional): log the command output. Defaults to True.
+        timeout (int, optional): number of seconds to wait for the command to complete.
+            Defaults to 120 seconds.
+
+    Returns:
+        CommandResult: groups of command results from the same hosts with the same return status
+    """
+    command = get_systemctl_command("daemon-reload", None, user)
+    return run_remote(logger, hosts, command_as_user(command, user), verbose, timeout)