DAOS-16366 test: Use agent/server config files from test directory (#14944) #15033

Merged
merged 4 commits into from Aug 30, 2024
11 changes: 8 additions & 3 deletions src/tests/ftest/daos_test/dfuse.py
@@ -10,7 +10,8 @@
from apricot import TestWithServers
from cmocka_utils import CmockaUtils
from dfuse_utils import get_dfuse, start_dfuse
from general_utils import create_directory, get_log_file
from file_utils import create_directory
from general_utils import get_log_file
from job_manager_utils import get_job_manager


@@ -80,7 +81,9 @@ def run_test(self, il_lib=None):
else:
# Bypass, simply create a remote directory and use that.
mount_dir = '/tmp/dfuse-test'
create_directory(self.hostlist_clients, mount_dir)
result = create_directory(self.log, self.hostlist_clients, mount_dir)
if not result.passed:
self.fail(f"Error creating {mount_dir} on {result.failed_hosts}")

cmocka_utils = CmockaUtils(
self.hostlist_clients, "dfuse", self.outputdir, self.test_dir, self.log)
@@ -118,7 +121,9 @@ def run_test(self, il_lib=None):
else:
# make D_IL_MOUNT_POINT different from mount_dir so it tests a non-DAOS filesystem
dummy_dir = '/tmp/dummy'
create_directory(self.hostlist_clients, dummy_dir)
result = create_directory(self.log, self.hostlist_clients, dummy_dir)
if not result.passed:
self.fail(f"Error creating {dummy_dir} on {result.failed_hosts}")
daos_test_env['D_IL_MOUNT_POINT'] = dummy_dir
if cache_mode != 'writeback':
command.append('--metadata')
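
A minimal sketch (not the actual DAOS file_utils implementation) of the result-checking pattern the hunks above adopt: the helper returns an object exposing passed and failed_hosts, and the caller decides whether to fail the test. The RemoteResult class and the local mkdir call are illustrative assumptions.

import logging
import subprocess
from dataclasses import dataclass, field


@dataclass
class RemoteResult:
    """Hypothetical stand-in for the object returned by file_utils.create_directory()."""
    passed: bool
    failed_hosts: list = field(default_factory=list)


def create_directory(log, hosts, directory):
    """Create a directory for each host; simulated locally to keep the sketch runnable."""
    failed = []
    for host in hosts:
        # The real helper runs the command remotely (e.g. via clush); a local
        # mkdir -p stands in for it here.
        proc = subprocess.run(["mkdir", "-p", directory], capture_output=True, text=True)
        if proc.returncode != 0:
            log.error("mkdir on %s failed: %s", host, proc.stderr.strip())
            failed.append(host)
    return RemoteResult(passed=not failed, failed_hosts=failed)


if __name__ == "__main__":
    log = logging.getLogger("sketch")
    mount_dir = "/tmp/dfuse-test"
    result = create_directory(log, ["client-1", "client-2"], mount_dir)
    if not result.passed:
        raise RuntimeError(f"Error creating {mount_dir} on {result.failed_hosts}")
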
17 changes: 14 additions & 3 deletions src/tests/ftest/launch.py
@@ -21,7 +21,8 @@
from util.code_coverage_utils import CodeCoverage
from util.environment_utils import TestEnvironment, TestEnvironmentException, set_test_environment
from util.host_utils import get_local_host
from util.launch_utils import LaunchException, TestGroup, setup_fuse_config, summarize_run
from util.launch_utils import (LaunchException, TestGroup, setup_fuse_config, setup_systemctl,
summarize_run)
from util.logger_utils import LOG_FILE_FORMAT, get_console_handler, get_file_handler
from util.network_utils import PROVIDER_ALIAS, SUPPORTED_PROVIDERS
from util.package_utils import find_packages
@@ -270,7 +271,8 @@ def _run(self, args):
# pylint: disable=unsupported-binary-operation
all_hosts = args.test_servers | args.test_clients | self.local_host
self.details["installed packages"] = find_packages(
logger, all_hosts, "'^(daos|libfabric|mercury|ior|openmpi|mpifileutils)-'")
logger, all_hosts,
"'^(daos|libfabric|mercury|ior|openmpi|mpifileutils|mlnx-ofed-basic)-'")

# Setup the test environment
test_env = TestEnvironment()
@@ -325,6 +327,15 @@ def _run(self, args):
message = "Issue detected setting up the fuse configuration"
setup_result.warn_test(logger, "Setup", message, sys.exc_info())

# Setup override systemctl files
try:
clients = args.test_clients if args.test_clients else args.test_servers
cleanup_files = setup_systemctl(
logger, args.test_servers, clients | self.local_host, test_env)
except LaunchException:
message = "Issue detected setting up the systemctl configuration"
return self.get_exit_status(1, message, "Setup", sys.exc_info())

# Get the core file pattern information
core_files = {}
if args.process_cores:
@@ -370,7 +381,7 @@ def _run(self, args):
logger, self.result, self.repeat, self.slurm_setup, args.sparse, args.failfast,
not args.disable_stop_daos, args.archive, args.rename, args.jenkinslog, core_files,
args.logs_threshold, args.user_create, code_coverage, self.job_results_dir,
self.logdir, args.clear_mounts)
self.logdir, args.clear_mounts, cleanup_files)

# Convert the test status to a launch.py status
status |= summarize_run(logger, self.mode, test_status)
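
A hedged sketch of the setup/cleanup hand-off introduced in launch.py above: the systemctl setup step creates override files and returns their paths so the later run and cleanup stages know what to remove. The helper body, file locations, and host names here are assumptions; only the fallback from test clients to test servers mirrors the diff.

import logging
import tempfile


def setup_systemctl_overrides(logger, servers, clients):
    """Create one override file per host and return the paths for later cleanup."""
    cleanup_files = []
    for host in sorted(set(servers) | set(clients)):
        handle = tempfile.NamedTemporaryFile(
            prefix=f"{host}_systemd_override_", suffix=".conf", delete=False)
        handle.close()
        logger.debug("Created override file %s for %s", handle.name, host)
        cleanup_files.append(handle.name)
    return cleanup_files


logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger("sketch")
test_servers = {"server-1"}
test_clients = set()  # empty when no test clients were requested
clients = test_clients if test_clients else test_servers  # same fallback as in _run()
cleanup_files = setup_systemctl_overrides(log, test_servers, clients)
log.info("Files to remove after the run: %s", cleanup_files)
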
7 changes: 4 additions & 3 deletions src/tests/ftest/pool/destroy.py
@@ -362,10 +362,11 @@ def test_destroy_wrong_group(self):
server_group_b = self.server_group + "_b"

# Prepare and configure dmg config files for a and b.
dmg_config_file_a = get_default_config_file(name="control_a")
config_path = os.path.dirname(self.test_env.control_config)
dmg_config_file_a = get_default_config_file(name="control_a", path=config_path)
dmg_config_temp_a = self.get_config_file(
name=server_group_a, command="dmg", path=self.test_dir)
dmg_config_file_b = get_default_config_file(name="control_b")
dmg_config_file_b = get_default_config_file(name="control_b", path=config_path)
dmg_config_temp_b = self.get_config_file(
name=server_group_b, command="dmg", path=self.test_dir)

@@ -393,7 +394,7 @@ def test_destroy_wrong_group(self):

# Get dmg_c instance that uses daos_control_c.yml. Server group is b.
cert_dir = os.path.join(os.sep, "etc", "daos", "certs")
dmg_config_file_c = get_default_config_file(name="control_c")
dmg_config_file_c = get_default_config_file(name="control_c", path=config_path)
dmg_config_temp_c = self.get_config_file(
name=server_group_b, command="dmg", path=self.test_dir)
dmg_c = get_dmg_command(
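
A small sketch of the path handling these hunks rely on; the daos_<name>.yml naming and the example directory are assumptions inferred from the surrounding code, not the verified general_utils.get_default_config_file() implementation.

import os


def get_default_config_file(name, path=os.path.join(os.sep, "etc", "daos")):
    """Assumed behaviour: build '<path>/daos_<name>.yml' for the given component."""
    return os.path.join(path, f"daos_{name}.yml")


# Mirror the test: derive the directory from the control config that already lives
# in the test directory, so the per-group dmg configs are written next to it.
control_config = "/var/tmp/daos_testing/configs/daos_control.yml"  # example path only
config_path = os.path.dirname(control_config)
print(get_default_config_file(name="control_a", path=config_path))
# -> /var/tmp/daos_testing/configs/daos_control_a.yml
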
17 changes: 7 additions & 10 deletions src/tests/ftest/recovery/ddb.py
@@ -10,10 +10,12 @@
from ClusterShell.NodeSet import NodeSet
from ddb_utils import DdbCommand
from exception_utils import CommandFailure
from general_utils import (DaosTestError, create_string_buffer, distribute_files,
get_clush_command, get_random_string, report_errors, run_command)
from file_utils import distribute_files
from general_utils import (DaosTestError, create_string_buffer, get_random_string, report_errors,
run_command)
from pydaos.raw import DaosObjClass, IORequest
from recovery_test_base import RecoveryTestBase
from run_utils import get_clush_command


def insert_objects(context, container, object_count, dkey_count, akey_count, base_dkey,
@@ -507,14 +509,9 @@ def test_recovery_ddb_load(self):
file.write(new_data)

# Copy the created file to server node.
try:
distribute_files(
hosts=host, source=load_file_path, destination=load_file_path,
mkdir=False)
except DaosTestError as error:
raise CommandFailure(
"ERROR: Copying new_data.txt to {0}: {1}".format(host, error)) \
from error
result = distribute_files(self.log, host, load_file_path, load_file_path, False)
if not result.passed:
raise CommandFailure(f"ERROR: Copying new_data.txt to {result.failed_hosts}")

# The file with the new data is ready. Run ddb load.
ddb_command.value_load(component_path="[0]/[0]/[0]/[0]", load_file_path=load_file_path)
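
Under the same assumptions as the sketch for dfuse.py, a compact illustration of replacing the exception-raising copy helper with one that returns a result object, keeping the CommandFailure raise on the caller side; the CopyResult class and the local shutil.copy are stand-ins for the real remote transfer.

import logging
import shutil
from dataclasses import dataclass, field


class CommandFailure(Exception):
    """Stand-in for exception_utils.CommandFailure."""


@dataclass
class CopyResult:
    passed: bool
    failed_hosts: list = field(default_factory=list)


def distribute_files(log, hosts, source, destination, mkdir=True):
    """Copy a file once per host; a local copy stands in for the remote transfer."""
    failed = []
    for host in hosts:
        try:
            shutil.copy(source, f"{destination}.{host}")
        except OSError as error:
            log.error("copy to %s failed: %s", host, error)
            failed.append(host)
    return CopyResult(passed=not failed, failed_hosts=failed)


log = logging.getLogger("sketch")
load_file_path = "/tmp/new_data.txt"
with open(load_file_path, "w", encoding="utf-8") as handle:
    handle.write("new data\n")
result = distribute_files(log, ["server-1"], load_file_path, load_file_path, False)
if not result.passed:
    raise CommandFailure(f"ERROR: Copying new_data.txt to {result.failed_hosts}")
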
6 changes: 3 additions & 3 deletions src/tests/ftest/util/agent_utils.py
@@ -6,7 +6,6 @@
import os
import re
import socket
from getpass import getuser

from agent_utils_params import DaosAgentTransportCredentials, DaosAgentYamlParameters
from ClusterShell.NodeSet import NodeSet
@@ -289,7 +288,7 @@ def start(self):

# Verify the socket directory exists when using a non-systemctl manager
if self.verify_socket_dir:
self.verify_socket_directory(getuser())
self.verify_socket_directory(self.manager.job.certificate_owner)

super().start()

@@ -319,7 +318,8 @@ def support_collect_log(self, **kwargs):
"""
cmd = self.manager.job.copy()
cmd.debug.value = False
cmd.config.value = get_default_config_file("agent")
cmd.config.value = get_default_config_file(
"agent", os.path.dirname(self.manager.job.yaml.filename))
cmd.set_command(("support", "collect-log"), **kwargs)
self.log.info("Support collect-log on clients: %s", str(cmd))
return run_remote(self.log, self.hosts, cmd.with_exports)
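
A brief sketch of the lookup change in support_collect_log(): the config handed to the support collect-log subcommand now comes from the directory of the yaml the agent was started with rather than /etc/daos. The paths and the daos_<name>.yml naming are illustrative assumptions.

import os

# The agent manager was started with a yaml in the per-test config directory;
# support collect-log reuses that directory instead of the system default.
agent_yaml_filename = "/var/tmp/daos_testing/configs/daos_agent.yml"  # example path only
config_dir = os.path.dirname(agent_yaml_filename)
collect_log_config = os.path.join(config_dir, "daos_agent.yml")  # assumed daos_<name>.yml naming
print("support collect-log config:", collect_log_config)
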
21 changes: 12 additions & 9 deletions src/tests/ftest/util/apricot/apricot/test.py
@@ -26,8 +26,7 @@
from exception_utils import CommandFailure
from fault_config_utils import FaultInjection
from general_utils import (dict_to_str, dump_engines_stacks, get_avocado_config_value,
get_default_config_file, get_file_listing, nodeset_append_suffix,
set_avocado_config_value)
nodeset_append_suffix, set_avocado_config_value)
from host_utils import HostException, HostInfo, HostRole, get_host_parameters, get_local_host
from logger_utils import TestLogger
from pydaos.raw import DaosApiError, DaosContext, DaosLog
@@ -762,13 +761,17 @@ def setUp(self):
self.fail(f"Error creating test-specific temporary directory on {result.failed_hosts}")

# Copy the fault injection files to the hosts.
self.fault_injection.copy_fault_files(self.host_info.all_hosts)
self.fault_injection.copy_fault_files(self.log, self.host_info.all_hosts)

# List common test directory contents before running the test
self.log.info("-" * 100)
self.log.debug("Common test directory (%s) contents:", self.test_dir)
self.log.debug("Common test directory (%s) contents:", os.path.dirname(self.test_dir))
all_hosts = include_local_host(self.host_info.all_hosts)
get_file_listing(all_hosts, self.test_dir, self.test_env.agent_user).log_output(self.log)
test_dir_parent = os.path.dirname(self.test_dir)
result = run_remote(self.log, all_hosts, f"df -h {test_dir_parent}")
if int(max(re.findall(r" ([\d+])% ", result.joined_stdout) + ["0"])) > 90:
run_remote(self.log, all_hosts, f"du -sh {test_dir_parent}/*")
self.log.info("-" * 100)

if not self.start_servers_once or self.name.uid == 1:
# Kill commands left running on the hosts (from a previous test)
@@ -1063,7 +1066,7 @@ def add_agent_manager(self, group=None, config_file=None, config_temp=None):
if group is None:
group = self.server_group
if config_file is None and self.agent_manager_class == "Systemctl":
config_file = get_default_config_file("agent")
config_file = self.test_env.agent_config
config_temp = self.get_config_file(group, "agent", self.test_dir)
elif config_file is None:
config_file = self.get_config_file(group, "agent")
@@ -1113,14 +1116,14 @@ def add_server_manager(self, group=None, svr_config_file=None,
if group is None:
group = self.server_group
if svr_config_file is None and self.server_manager_class == "Systemctl":
svr_config_file = get_default_config_file("server")
svr_config_file = self.test_env.server_config
svr_config_temp = self.get_config_file(
group, "server", self.test_dir)
elif svr_config_file is None:
svr_config_file = self.get_config_file(group, "server")
svr_config_temp = None
if dmg_config_file is None and self.server_manager_class == "Systemctl":
dmg_config_file = get_default_config_file("control")
dmg_config_file = self.test_env.control_config
dmg_config_temp = self.get_config_file(group, "dmg", self.test_dir)
elif dmg_config_file is None:
dmg_config_file = self.get_config_file(group, "dmg")
@@ -1669,7 +1672,7 @@ def get_dmg_command(self, index=0):
return self.server_managers[index].dmg

if self.server_manager_class == "Systemctl":
dmg_config_file = get_default_config_file("control")
dmg_config_file = self.test_env.control_config
dmg_config_temp = self.get_config_file("daos", "dmg", self.test_dir)
dmg_cert_dir = os.path.join(os.sep, "etc", "daos", "certs")
else:
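
A standalone sketch of the disk-usage guard added to setUp() above, run locally instead of through run_remote(); the path is an example and the percentage parsing is a simplified variant, not necessarily the harness's exact expression.

import re
import subprocess

test_dir_parent = "/var/tmp/daos_testing"  # example path only
result = subprocess.run(
    ["df", "-h", test_dir_parent], capture_output=True, text=True, check=False)
# Pull the "Use%" column out of the human-readable df output; default to 0 when
# nothing matches (for example when the path does not exist on this machine).
usage = max(int(value) for value in re.findall(r"(\d+)%", result.stdout) + ["0"])
if usage > 90:
    # Show which subdirectories are consuming the space before the test starts.
    subprocess.run(f"du -sh {test_dir_parent}/*", shell=True, check=False)
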
109 changes: 10 additions & 99 deletions src/tests/ftest/util/collection_utils.py
@@ -17,6 +17,7 @@
from util.environment_utils import TestEnvironment
from util.host_utils import get_local_host
from util.run_utils import find_command, run_local, run_remote, stop_processes
from util.systemctl_utils import stop_service
from util.user_utils import get_chown_command
from util.yaml_utils import get_test_category

@@ -66,97 +67,6 @@ def stop_daos_server_service(logger, test):
return stop_service(logger, hosts, service)


def stop_service(logger, hosts, service):
"""Stop any daos_server.service running on the hosts running servers.

Args:
logger (Logger): logger for the messages produced by this method
hosts (NodeSet): list of hosts on which to stop the service.
service (str): name of the service

Returns:
bool: True if the service was successfully stopped; False otherwise

"""
result = {"status": True}
if hosts:
status_keys = ["reset-failed", "stop", "disable"]
mapping = {"stop": "active", "disable": "enabled", "reset-failed": "failed"}
check_hosts = NodeSet(hosts)
loop = 1
# Reduce 'max_loops' to 2 once https://jira.hpdd.intel.com/browse/DAOS-7809
# has been resolved
max_loops = 3
while check_hosts:
# Check the status of the service on each host
result = get_service_status(logger, check_hosts, service)
check_hosts = NodeSet()
for key in status_keys:
if result[key]:
if loop == max_loops:
# Exit the while loop if the service is still running
logger.error(
" - Error %s still %s on %s", service, mapping[key], result[key])
result["status"] = False
else:
# Issue the appropriate systemctl command to remedy the
# detected state, e.g. 'stop' for 'active'.
command = ["sudo", "-n", "systemctl", key, service]
run_remote(logger, result[key], " ".join(command))

# Run the status check again on this group of hosts
check_hosts.add(result[key])
loop += 1
else:
logger.debug(" Skipping stopping %s service - no hosts", service)

return result["status"]


def get_service_status(logger, hosts, service):
"""Get the status of the daos_server.service.

Args:
logger (Logger): logger for the messages produced by this method
hosts (NodeSet): hosts on which to get the service state
service (str): name of the service

Returns:
dict: a dictionary with the following keys:
- "status": boolean set to True if status was obtained; False otherwise
- "stop": NodeSet where to stop the daos_server.service
- "disable": NodeSet where to disable the daos_server.service
- "reset-failed": NodeSet where to reset the daos_server.service

"""
status = {
"status": True,
"stop": NodeSet(),
"disable": NodeSet(),
"reset-failed": NodeSet()}
status_states = {
"stop": ["active", "activating", "deactivating"],
"disable": ["active", "activating", "deactivating"],
"reset-failed": ["failed"]}
command = ["systemctl", "is-active", service]
result = run_remote(logger, hosts, " ".join(command))
for data in result.output:
if data.timeout:
status["status"] = False
status["stop"].add(data.hosts)
status["disable"].add(data.hosts)
status["reset-failed"].add(data.hosts)
logger.debug(" %s: TIMEOUT", data.hosts)
break
logger.debug(" %s: %s", data.hosts, "\n".join(data.stdout))
for key, state_list in status_states.items():
for line in data.stdout:
if line in state_list:
status[key].add(data.hosts)
break
return status


def reset_server_storage(logger, test):
"""Reset the server storage for the hosts that ran servers in the test.

@@ -981,14 +891,15 @@ def collect_test_result(logger, test, test_result, job_results_dir, stop_daos, a
"depth": 1,
"timeout": 300,
}
remote_files["remote configuration files"] = {
"source": os.path.join(os.sep, "etc", "daos"),
"destination": os.path.join(job_results_dir, "latest", TEST_RESULTS_DIRS[0]),
"pattern": "daos_*.yml",
"hosts": test.host_info.all_hosts,
"depth": 1,
"timeout": 300,
}
for index, source in enumerate(test_env.config_file_directories()):
remote_files[f"remote configuration files ({index})"] = {
"source": source,
"destination": os.path.join(job_results_dir, "latest", TEST_RESULTS_DIRS[0]),
"pattern": "daos_*.yml",
"hosts": test.host_info.all_hosts,
"depth": 1,
"timeout": 300,
}
remote_files["daos log files"] = {
"source": test_env.log_dir,
"destination": os.path.join(job_results_dir, "latest", TEST_RESULTS_DIRS[1]),
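
A minimal sketch of the dictionary-building loop above, assuming TestEnvironment.config_file_directories() returns the list of directories that hold the agent/server/control yaml files for the test; all paths and host names are illustrative.

import os

# Assumed return value of test_env.config_file_directories() for this sketch.
config_file_directories = ["/var/tmp/daos_testing/configs", os.path.join(os.sep, "etc", "daos")]
job_results_dir = "/var/tmp/ftest/job_results"  # example only
test_results_subdir = "configs"                 # stand-in for TEST_RESULTS_DIRS[0]
all_hosts = ["server-1", "client-1"]

remote_files = {}
for index, source in enumerate(config_file_directories):
    remote_files[f"remote configuration files ({index})"] = {
        "source": source,
        "destination": os.path.join(job_results_dir, "latest", test_results_subdir),
        "pattern": "daos_*.yml",
        "hosts": all_hosts,
        "depth": 1,
        "timeout": 300,
    }

for label, info in remote_files.items():
    print(label, "->", info["source"])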