From cf7322a79f182781767d563cffa315562e7e68dc Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Fri, 30 Aug 2024 10:24:16 -0400 Subject: [PATCH] DAOS-16366 test: Use agent/server config files from test directory (#14944) (#15033) Use agent, control, and server config files placed in the common test directory instead of /etc/daos with a systemctl override configuration file. Signed-off-by: Phil Henderson --- src/tests/ftest/daos_test/dfuse.py | 11 +- src/tests/ftest/launch.py | 17 +- src/tests/ftest/pool/destroy.py | 7 +- src/tests/ftest/recovery/ddb.py | 17 +- src/tests/ftest/util/agent_utils.py | 6 +- src/tests/ftest/util/apricot/apricot/test.py | 21 +- src/tests/ftest/util/collection_utils.py | 109 +------- src/tests/ftest/util/command_utils.py | 49 ++-- src/tests/ftest/util/environment_utils.py | 168 ++++++------- src/tests/ftest/util/fault_config_utils.py | 13 +- src/tests/ftest/util/file_utils.py | 147 +++++++++++ src/tests/ftest/util/general_utils.py | 173 +------------ src/tests/ftest/util/launch_utils.py | 79 +++++- src/tests/ftest/util/server_utils.py | 3 +- src/tests/ftest/util/systemctl_utils.py | 249 +++++++++++++++++++ 15 files changed, 652 insertions(+), 417 deletions(-) create mode 100644 src/tests/ftest/util/file_utils.py create mode 100644 src/tests/ftest/util/systemctl_utils.py diff --git a/src/tests/ftest/daos_test/dfuse.py b/src/tests/ftest/daos_test/dfuse.py index 205d1cd11a6..e735e745cab 100644 --- a/src/tests/ftest/daos_test/dfuse.py +++ b/src/tests/ftest/daos_test/dfuse.py @@ -10,7 +10,8 @@ from apricot import TestWithServers from cmocka_utils import CmockaUtils from dfuse_utils import get_dfuse, start_dfuse -from general_utils import create_directory, get_log_file +from file_utils import create_directory +from general_utils import get_log_file from job_manager_utils import get_job_manager @@ -80,7 +81,9 @@ def run_test(self, il_lib=None): else: # Bypass, simply create a remote directory and use that. 
mount_dir = '/tmp/dfuse-test' - create_directory(self.hostlist_clients, mount_dir) + result = create_directory(self.log, self.hostlist_clients, mount_dir) + if not result.passed: + self.fail(f"Error creating {mount_dir} on {result.failed_hosts}") cmocka_utils = CmockaUtils( self.hostlist_clients, "dfuse", self.outputdir, self.test_dir, self.log) @@ -118,7 +121,9 @@ def run_test(self, il_lib=None): else: # make D_IL_MOUNT_POINT different from mount_dir so it tests a non-DAOS filesystem dummy_dir = '/tmp/dummy' - create_directory(self.hostlist_clients, dummy_dir) + result = create_directory(self.log, self.hostlist_clients, dummy_dir) + if not result.passed: + self.fail(f"Error creating {dummy_dir} on {result.failed_hosts}") daos_test_env['D_IL_MOUNT_POINT'] = dummy_dir if cache_mode != 'writeback': command.append('--metadata') diff --git a/src/tests/ftest/launch.py b/src/tests/ftest/launch.py index ad5b034301d..f3daa8464d7 100755 --- a/src/tests/ftest/launch.py +++ b/src/tests/ftest/launch.py @@ -21,7 +21,8 @@ from util.code_coverage_utils import CodeCoverage from util.environment_utils import TestEnvironment, TestEnvironmentException, set_test_environment from util.host_utils import get_local_host -from util.launch_utils import LaunchException, TestGroup, setup_fuse_config, summarize_run +from util.launch_utils import (LaunchException, TestGroup, setup_fuse_config, setup_systemctl, + summarize_run) from util.logger_utils import LOG_FILE_FORMAT, get_console_handler, get_file_handler from util.network_utils import PROVIDER_ALIAS, SUPPORTED_PROVIDERS from util.package_utils import find_packages @@ -270,7 +271,8 @@ def _run(self, args): # pylint: disable=unsupported-binary-operation all_hosts = args.test_servers | args.test_clients | self.local_host self.details["installed packages"] = find_packages( - logger, all_hosts, "'^(daos|libfabric|mercury|ior|openmpi|mpifileutils)-'") + logger, all_hosts, + 
"'^(daos|libfabric|mercury|ior|openmpi|mpifileutils|mlnx-ofed-basic)-'") # Setup the test environment test_env = TestEnvironment() @@ -325,6 +327,15 @@ def _run(self, args): message = "Issue detected setting up the fuse configuration" setup_result.warn_test(logger, "Setup", message, sys.exc_info()) + # Setup override systemctl files + try: + clients = args.test_clients if args.test_clients else args.test_servers + cleanup_files = setup_systemctl( + logger, args.test_servers, clients | self.local_host, test_env) + except LaunchException: + message = "Issue detected setting up the systemctl configuration" + return self.get_exit_status(1, message, "Setup", sys.exc_info()) + # Get the core file pattern information core_files = {} if args.process_cores: @@ -370,7 +381,7 @@ def _run(self, args): logger, self.result, self.repeat, self.slurm_setup, args.sparse, args.failfast, not args.disable_stop_daos, args.archive, args.rename, args.jenkinslog, core_files, args.logs_threshold, args.user_create, code_coverage, self.job_results_dir, - self.logdir, args.clear_mounts) + self.logdir, args.clear_mounts, cleanup_files) # Convert the test status to a launch.py status status |= summarize_run(logger, self.mode, test_status) diff --git a/src/tests/ftest/pool/destroy.py b/src/tests/ftest/pool/destroy.py index 26280a74978..2bcd5f6cea7 100644 --- a/src/tests/ftest/pool/destroy.py +++ b/src/tests/ftest/pool/destroy.py @@ -362,10 +362,11 @@ def test_destroy_wrong_group(self): server_group_b = self.server_group + "_b" # Prepare and configure dmg config files for a and b. 
- dmg_config_file_a = get_default_config_file(name="control_a") + config_path = os.path.dirname(self.test_env.control_config) + dmg_config_file_a = get_default_config_file(name="control_a", path=config_path) dmg_config_temp_a = self.get_config_file( name=server_group_a, command="dmg", path=self.test_dir) - dmg_config_file_b = get_default_config_file(name="control_b") + dmg_config_file_b = get_default_config_file(name="control_b", path=config_path) dmg_config_temp_b = self.get_config_file( name=server_group_b, command="dmg", path=self.test_dir) @@ -393,7 +394,7 @@ def test_destroy_wrong_group(self): # Get dmg_c instance that uses daos_control_c.yml. Server group is b. cert_dir = os.path.join(os.sep, "etc", "daos", "certs") - dmg_config_file_c = get_default_config_file(name="control_c") + dmg_config_file_c = get_default_config_file(name="control_c", path=config_path) dmg_config_temp_c = self.get_config_file( name=server_group_b, command="dmg", path=self.test_dir) dmg_c = get_dmg_command( diff --git a/src/tests/ftest/recovery/ddb.py b/src/tests/ftest/recovery/ddb.py index 8447db1873a..04df3184984 100644 --- a/src/tests/ftest/recovery/ddb.py +++ b/src/tests/ftest/recovery/ddb.py @@ -10,10 +10,12 @@ from ClusterShell.NodeSet import NodeSet from ddb_utils import DdbCommand from exception_utils import CommandFailure -from general_utils import (DaosTestError, create_string_buffer, distribute_files, - get_clush_command, get_random_string, report_errors, run_command) +from file_utils import distribute_files +from general_utils import (DaosTestError, create_string_buffer, get_random_string, report_errors, + run_command) from pydaos.raw import DaosObjClass, IORequest from recovery_test_base import RecoveryTestBase +from run_utils import get_clush_command def insert_objects(context, container, object_count, dkey_count, akey_count, base_dkey, @@ -507,14 +509,9 @@ def test_recovery_ddb_load(self): file.write(new_data) # Copy the created file to server node. 
- try: - distribute_files( - hosts=host, source=load_file_path, destination=load_file_path, - mkdir=False) - except DaosTestError as error: - raise CommandFailure( - "ERROR: Copying new_data.txt to {0}: {1}".format(host, error)) \ - from error + result = distribute_files(self.log, host, load_file_path, load_file_path, False) + if not result.passed: + raise CommandFailure(f"ERROR: Copying new_data.txt to {result.failed_hosts}") # The file with the new data is ready. Run ddb load. ddb_command.value_load(component_path="[0]/[0]/[0]/[0]", load_file_path=load_file_path) diff --git a/src/tests/ftest/util/agent_utils.py b/src/tests/ftest/util/agent_utils.py index ff815356563..74b79fb9796 100644 --- a/src/tests/ftest/util/agent_utils.py +++ b/src/tests/ftest/util/agent_utils.py @@ -6,7 +6,6 @@ import os import re import socket -from getpass import getuser from agent_utils_params import DaosAgentTransportCredentials, DaosAgentYamlParameters from ClusterShell.NodeSet import NodeSet @@ -289,7 +288,7 @@ def start(self): # Verify the socket directory exists when using a non-systemctl manager if self.verify_socket_dir: - self.verify_socket_directory(getuser()) + self.verify_socket_directory(self.manager.job.certificate_owner) super().start() @@ -319,7 +318,8 @@ def support_collect_log(self, **kwargs): """ cmd = self.manager.job.copy() cmd.debug.value = False - cmd.config.value = get_default_config_file("agent") + cmd.config.value = get_default_config_file( + "agent", os.path.dirname(self.manager.job.yaml.filename)) cmd.set_command(("support", "collect-log"), **kwargs) self.log.info("Support collect-log on clients: %s", str(cmd)) return run_remote(self.log, self.hosts, cmd.with_exports) diff --git a/src/tests/ftest/util/apricot/apricot/test.py b/src/tests/ftest/util/apricot/apricot/test.py index 7a5542c15af..42e05937f37 100644 --- a/src/tests/ftest/util/apricot/apricot/test.py +++ b/src/tests/ftest/util/apricot/apricot/test.py @@ -26,8 +26,7 @@ from exception_utils import 
CommandFailure from fault_config_utils import FaultInjection from general_utils import (dict_to_str, dump_engines_stacks, get_avocado_config_value, - get_default_config_file, get_file_listing, nodeset_append_suffix, - set_avocado_config_value) + nodeset_append_suffix, set_avocado_config_value) from host_utils import HostException, HostInfo, HostRole, get_host_parameters, get_local_host from logger_utils import TestLogger from pydaos.raw import DaosApiError, DaosContext, DaosLog @@ -762,13 +761,17 @@ def setUp(self): self.fail(f"Error creating test-specific temporary directory on {result.failed_hosts}") # Copy the fault injection files to the hosts. - self.fault_injection.copy_fault_files(self.host_info.all_hosts) + self.fault_injection.copy_fault_files(self.log, self.host_info.all_hosts) # List common test directory contents before running the test self.log.info("-" * 100) - self.log.debug("Common test directory (%s) contents:", self.test_dir) + self.log.debug("Common test directory (%s) contents:", os.path.dirname(self.test_dir)) all_hosts = include_local_host(self.host_info.all_hosts) - get_file_listing(all_hosts, self.test_dir, self.test_env.agent_user).log_output(self.log) + test_dir_parent = os.path.dirname(self.test_dir) + result = run_remote(self.log, all_hosts, f"df -h {test_dir_parent}") + if int(max(re.findall(r" ([\d+])% ", result.joined_stdout) + ["0"])) > 90: + run_remote(self.log, all_hosts, f"du -sh {test_dir_parent}/*") + self.log.info("-" * 100) if not self.start_servers_once or self.name.uid == 1: # Kill commands left running on the hosts (from a previous test) @@ -1063,7 +1066,7 @@ def add_agent_manager(self, group=None, config_file=None, config_temp=None): if group is None: group = self.server_group if config_file is None and self.agent_manager_class == "Systemctl": - config_file = get_default_config_file("agent") + config_file = self.test_env.agent_config config_temp = self.get_config_file(group, "agent", self.test_dir) elif config_file is 
None: config_file = self.get_config_file(group, "agent") @@ -1113,14 +1116,14 @@ def add_server_manager(self, group=None, svr_config_file=None, if group is None: group = self.server_group if svr_config_file is None and self.server_manager_class == "Systemctl": - svr_config_file = get_default_config_file("server") + svr_config_file = self.test_env.server_config svr_config_temp = self.get_config_file( group, "server", self.test_dir) elif svr_config_file is None: svr_config_file = self.get_config_file(group, "server") svr_config_temp = None if dmg_config_file is None and self.server_manager_class == "Systemctl": - dmg_config_file = get_default_config_file("control") + dmg_config_file = self.test_env.control_config dmg_config_temp = self.get_config_file(group, "dmg", self.test_dir) elif dmg_config_file is None: dmg_config_file = self.get_config_file(group, "dmg") @@ -1668,7 +1671,7 @@ def get_dmg_command(self, index=0): return self.server_managers[index].dmg if self.server_manager_class == "Systemctl": - dmg_config_file = get_default_config_file("control") + dmg_config_file = self.test_env.control_config dmg_config_temp = self.get_config_file("daos", "dmg", self.test_dir) dmg_cert_dir = os.path.join(os.sep, "etc", "daos", "certs") else: diff --git a/src/tests/ftest/util/collection_utils.py b/src/tests/ftest/util/collection_utils.py index 3b054eb19e9..a900769f4d6 100644 --- a/src/tests/ftest/util/collection_utils.py +++ b/src/tests/ftest/util/collection_utils.py @@ -17,6 +17,7 @@ from util.environment_utils import TestEnvironment from util.host_utils import get_local_host from util.run_utils import find_command, run_local, run_remote, stop_processes +from util.systemctl_utils import stop_service from util.user_utils import get_chown_command from util.yaml_utils import get_test_category @@ -66,97 +67,6 @@ def stop_daos_server_service(logger, test): return stop_service(logger, hosts, service) -def stop_service(logger, hosts, service): - """Stop any daos_server.service 
running on the hosts running servers. - - Args: - logger (Logger): logger for the messages produced by this method - hosts (NodeSet): list of hosts on which to stop the service. - service (str): name of the service - - Returns: - bool: True if the service was successfully stopped; False otherwise - - """ - result = {"status": True} - if hosts: - status_keys = ["reset-failed", "stop", "disable"] - mapping = {"stop": "active", "disable": "enabled", "reset-failed": "failed"} - check_hosts = NodeSet(hosts) - loop = 1 - # Reduce 'max_loops' to 2 once https://jira.hpdd.intel.com/browse/DAOS-7809 - # has been resolved - max_loops = 3 - while check_hosts: - # Check the status of the service on each host - result = get_service_status(logger, check_hosts, service) - check_hosts = NodeSet() - for key in status_keys: - if result[key]: - if loop == max_loops: - # Exit the while loop if the service is still running - logger.error( - " - Error %s still %s on %s", service, mapping[key], result[key]) - result["status"] = False - else: - # Issue the appropriate systemctl command to remedy the - # detected state, e.g. 'stop' for 'active'. - command = ["sudo", "-n", "systemctl", key, service] - run_remote(logger, result[key], " ".join(command)) - - # Run the status check again on this group of hosts - check_hosts.add(result[key]) - loop += 1 - else: - logger.debug(" Skipping stopping %s service - no hosts", service) - - return result["status"] - - -def get_service_status(logger, hosts, service): - """Get the status of the daos_server.service. 
- - Args: - logger (Logger): logger for the messages produced by this method - hosts (NodeSet): hosts on which to get the service state - service (str): name of the service - - Returns: - dict: a dictionary with the following keys: - - "status": boolean set to True if status was obtained; False otherwise - - "stop": NodeSet where to stop the daos_server.service - - "disable": NodeSet where to disable the daos_server.service - - "reset-failed": NodeSet where to reset the daos_server.service - - """ - status = { - "status": True, - "stop": NodeSet(), - "disable": NodeSet(), - "reset-failed": NodeSet()} - status_states = { - "stop": ["active", "activating", "deactivating"], - "disable": ["active", "activating", "deactivating"], - "reset-failed": ["failed"]} - command = ["systemctl", "is-active", service] - result = run_remote(logger, hosts, " ".join(command)) - for data in result.output: - if data.timeout: - status["status"] = False - status["stop"].add(data.hosts) - status["disable"].add(data.hosts) - status["reset-failed"].add(data.hosts) - logger.debug(" %s: TIMEOUT", data.hosts) - break - logger.debug(" %s: %s", data.hosts, "\n".join(data.stdout)) - for key, state_list in status_states.items(): - for line in data.stdout: - if line in state_list: - status[key].add(data.hosts) - break - return status - - def reset_server_storage(logger, test): """Reset the server storage for the hosts that ran servers in the test. 
@@ -981,14 +891,15 @@ def collect_test_result(logger, test, test_result, job_results_dir, stop_daos, a "depth": 1, "timeout": 300, } - remote_files["remote configuration files"] = { - "source": os.path.join(os.sep, "etc", "daos"), - "destination": os.path.join(job_results_dir, "latest", TEST_RESULTS_DIRS[0]), - "pattern": "daos_*.yml", - "hosts": test.host_info.all_hosts, - "depth": 1, - "timeout": 300, - } + for index, source in enumerate(test_env.config_file_directories()): + remote_files[f"remote configuration files ({index})"] = { + "source": source, + "destination": os.path.join(job_results_dir, "latest", TEST_RESULTS_DIRS[0]), + "pattern": "daos_*.yml", + "hosts": test.host_info.all_hosts, + "depth": 1, + "timeout": 300, + } remote_files["daos log files"] = { "source": test_env.log_dir, "destination": os.path.join(job_results_dir, "latest", TEST_RESULTS_DIRS[1]), diff --git a/src/tests/ftest/util/command_utils.py b/src/tests/ftest/util/command_utils.py index b22300974fb..06da3b27868 100644 --- a/src/tests/ftest/util/command_utils.py +++ b/src/tests/ftest/util/command_utils.py @@ -19,9 +19,9 @@ from command_utils_base import (BasicParameter, CommandWithParameters, EnvironmentVariables, FormattedParameter, LogParameter, ObjectWithParameters) from exception_utils import CommandFailure -from general_utils import (DaosTestError, change_file_owner, check_file_exists, create_directory, - distribute_files, get_file_listing, get_job_manager_class, - get_subprocess_stdout, run_command) +from file_utils import change_file_owner, create_directory, distribute_files +from general_utils import (DaosTestError, check_file_exists, get_file_listing, + get_job_manager_class, get_subprocess_stdout, run_command) from run_utils import command_as_user, run_remote from user_utils import get_primary_group from yaml_utils import get_yaml_data @@ -1017,19 +1017,17 @@ def copy_certificates(self, source, hosts): self.log.debug("Copying certificates for %s:", self._command) data = 
yaml.get_certificate_data(yaml.get_attribute_names(LogParameter)) for name in data: - create_directory(hosts, name, verbose=True, raise_exception=False) + create_directory(self.log, hosts, name, verbose=True) for file_name in data[name]: src_file = os.path.join(source, file_name) dst_file = os.path.join(name, file_name) self.log.debug(" %s -> %s", src_file, dst_file) result = distribute_files( - hosts, src_file, dst_file, mkdir=False, - verbose=False, raise_exception=False, sudo=True, - owner=self.certificate_owner) - if result.exit_status != 0: + self.log, hosts, src_file, dst_file, mkdir=False, + verbose=False, sudo=True, owner=self.certificate_owner) + if not result.passed: self.log.info( - " WARNING: %s copy failed on %s:\n%s", - dst_file, hosts, result) + " WARNING: %s copy failed on %s", dst_file, result.failed_hosts) names.add(name) yaml = yaml.other_params @@ -1051,21 +1049,18 @@ def copy_configuration(self, hosts): Raises: CommandFailure: if there is an error copying the configuration file - """ if self.yaml is not None and hasattr(self.yaml, "filename"): if self.temporary_file and hosts: self.log.info( "Copying %s yaml configuration file to %s on %s", self.temporary_file, self.yaml.filename, hosts) - try: - distribute_files( - hosts, self.temporary_file, self.yaml.filename, - verbose=False, sudo=True) - except DaosTestError as error: + result = distribute_files( + self.log, hosts, self.temporary_file, self.yaml.filename, verbose=False, + sudo=True) + if not result.passed: raise CommandFailure( - "ERROR: Copying yaml configuration file to {}: " - "{}".format(hosts, error)) from error + f"ERROR: Copying yaml configuration file to {result.failed_hosts}") def verify_socket_directory(self, user, hosts): """Verify the domain socket directory is present and owned by this user. 
@@ -1088,15 +1083,17 @@ def verify_socket_directory(self, user, hosts): self.log.info( "%s: creating socket directory %s for user %s on %s", self.command, directory, user, nodes) - try: - create_directory(nodes, directory, sudo=True) - change_file_owner(nodes, directory, user, get_primary_group(user), sudo=True) - except DaosTestError as error: + result = create_directory(self.log, nodes, directory, user="root") + if not result.passed: + raise CommandFailure( + f"{self.command}: error creating socket directory {directory} for user " + f"{user} on {result.failed_hosts}") + result = change_file_owner( + self.log, nodes, directory, user, get_primary_group(user), user="root") + if not result.passed: raise CommandFailure( - "{}: error setting up missing socket directory {} for " - "user {} on {}:\n{}".format( - self.command, directory, user, nodes, - error)) from error + f"{self.command}: error setting socket directory {directory} owner for " + f"user {user} on {result.failed_hosts}") def get_socket_dir(self): """Get the socket directory. 
diff --git a/src/tests/ftest/util/environment_utils.py b/src/tests/ftest/util/environment_utils.py index 1e295be3815..e36d750500e 100644 --- a/src/tests/ftest/util/environment_utils.py +++ b/src/tests/ftest/util/environment_utils.py @@ -9,6 +9,7 @@ from ClusterShell.NodeSet import NodeSet # pylint: disable=import-error,no-name-in-module +from util.host_utils import get_local_host from util.network_utils import (PROVIDER_ALIAS, SUPPORTED_PROVIDERS, NetworkException, get_common_provider, get_fastest_interface) from util.run_utils import run_remote @@ -109,6 +110,9 @@ class TestEnvironment(): 'daos_prefix': 'DAOS_TEST_PREFIX', 'agent_user': 'DAOS_TEST_AGENT_USER', 'systemd_library_path': 'DAOS_TEST_SYSTEMD_LIBRARY_PATH', + 'control_config': 'DAOS_TEST_CONTROL_CONFIG', + 'agent_config': 'DAOS_TEST_AGENT_CONFIG', + 'server_config': 'DAOS_TEST_SERVER_CONFIG', } def __init__(self): @@ -155,29 +159,34 @@ def set_defaults(self, logger, servers=None, clients=None, provider=None, insecu # Set defaults for any unset values if self.log_dir is None: - self.log_dir = self._default_log_dir() + self.log_dir = os.path.join(os.sep, "var", "tmp", "daos_testing") if self.shared_dir is None: - self.shared_dir = self._default_shared_dir() + self.shared_dir = os.path.expanduser(os.path.join("~", "daos_test")) if self.app_dir is None: - self.app_dir = self._default_app_dir() + self.app_dir = os.path.join(self.shared_dir, "daos_test", "apps") if self.user_dir is None: - self.user_dir = self._default_user_dir() + self.user_dir = os.path.join(self.log_dir, "user") if self.interface is None: self.interface = self._default_interface(logger, all_hosts) if self.provider is None: self.provider = self._default_provider(logger, servers) if self.insecure_mode is None: - self.insecure_mode = self._default_insecure_mode() + self.insecure_mode = "True" if self.bullseye_src is None: - self.bullseye_src = self._default_bullseye_src() + self.bullseye_src = os.path.join( + 
os.path.dirname(os.path.abspath(__file__)), "..", "test.cov") if self.bullseye_file is None: - self.bullseye_file = self._default_bullseye_file() + self.bullseye_file = os.path.join(os.sep, "tmp", "test.cov") if self.daos_prefix is None: self.daos_prefix = self._default_daos_prefix(logger) if self.agent_user is None: - self.agent_user = self._default_agent_user() - if self.systemd_library_path is None: - self.systemd_library_path = self._default_systemd_library_path() + self.agent_user = 'root' + if self.control_config is None: + self.control_config = os.path.join(self.log_dir, "configs", "daos_control.yml") + if self.agent_config is None: + self.agent_config = os.path.join(self.log_dir, "configs", "daos_agent.yml") + if self.server_config is None: + self.server_config = os.path.join(self.log_dir, "configs", "daos_server.yml") def __set_value(self, key, value): """Set the test environment variable. @@ -209,14 +218,6 @@ def app_dir(self, value): """ self.__set_value('app_dir', value) - def _default_app_dir(self): - """Get the default application directory path. - - Returns: - str: the default application directory path - """ - return os.path.join(self.shared_dir, "daos_test", "apps") - @property def app_src(self): """Get the location from which to copy test applications. @@ -253,15 +254,6 @@ def log_dir(self, value): """ self.__set_value('log_dir', value) - @staticmethod - def _default_log_dir(): - """Get the default local log directory path. - - Returns: - str: the default local log directory path - """ - return os.path.join(os.sep, "var", "tmp", "daos_testing") - @property def shared_dir(self): """Get the shared log directory path. @@ -280,15 +272,6 @@ def shared_dir(self, value): """ self.__set_value('shared_dir', value) - @staticmethod - def _default_shared_dir(): - """Get the default shared log directory path. 
- - Returns: - str: the default shared log directory path - """ - return os.path.expanduser(os.path.join("~", "daos_test")) - @property def user_dir(self): """Get the user directory path. @@ -307,14 +290,6 @@ def user_dir(self, value): """ self.__set_value('user_dir', value) - def _default_user_dir(self): - """Get the default user directory path. - - Returns: - str: the default user directory path - """ - return os.path.join(self.log_dir, "user") - @property def interface(self): """Get the interface device. @@ -352,7 +327,7 @@ def _default_interface(self, logger, hosts): # Find all the /sys/class/net interfaces on the launch node (excluding lo) logger.debug("Detecting network devices - D_INTERFACE not set") try: - interface = get_fastest_interface(logger, hosts) + interface = get_fastest_interface(logger, hosts | get_local_host()) except NetworkException as error: raise TestEnvironmentException("Error obtaining a default interface!") from error return interface @@ -447,15 +422,6 @@ def insecure_mode(self, value): """ self.__set_value('insecure_mode', value) - @staticmethod - def _default_insecure_mode(): - """Get the default insecure mode. - - Returns: - str: the default insecure mode - """ - return "True" - @property def bullseye_src(self): """Get the bullseye source file. @@ -474,15 +440,6 @@ def bullseye_src(self, value): """ self.__set_value('bullseye_src', value) - @staticmethod - def _default_bullseye_src(): - """Get the default bullseye source file. - - Returns: - str: the default bullseye source file - """ - return os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "test.cov") - @property def bullseye_file(self): """Get the bullseye file. @@ -501,15 +458,6 @@ def bullseye_file(self, value): """ self.__set_value('bullseye_file', value) - @staticmethod - def _default_bullseye_file(): - """Get the default bullseye file. 
- - Returns: - str: the default bullseye file - """ - return os.path.join(os.sep, "tmp", "test.cov") - @property def daos_prefix(self): """Get the daos_prefix. @@ -572,15 +520,6 @@ def agent_user(self, value): """ self.__set_value('agent_user', value) - @staticmethod - def _default_agent_user(): - """Get the default daos_agent user. - - Returns: - str: the default daos_agent user - """ - return 'root' - @property def systemd_library_path(self): """Get the systemd LD_LIBRARY_PATH. @@ -599,14 +538,71 @@ def systemd_library_path(self, value): """ self.__set_value('systemd_library_path', value) - @staticmethod - def _default_systemd_library_path(): - """Get the default systemd LD_LIBRARY_PATH. + @property + def control_config(self): + """Get the control config file used in testing. + + Returns: + str: the control config file + """ + return os.environ.get(self.__ENV_VAR_MAP['control_config']) + + @control_config.setter + def control_config(self, value): + """Set the control config file used in testing. + + Args: + value (str): the control config file + """ + self.__set_value('control_config', value) + + @property + def agent_config(self): + """Get the agent config file used in testing. + + Returns: + str: the agent config file + """ + return os.environ.get(self.__ENV_VAR_MAP['agent_config']) + + @agent_config.setter + def agent_config(self, value): + """Set the agent config file used in testing. + + Args: + value (str): the agent config file + """ + self.__set_value('agent_config', value) + + @property + def server_config(self): + """Get the server config file used in testing. + + Returns: + str: the server config file + """ + return os.environ.get(self.__ENV_VAR_MAP['server_config']) + + @server_config.setter + def server_config(self, value): + """Set the server config file used in testing. 
+ + Args: + value (str): the server config file + """ + self.__set_value('server_config', value) + + def config_file_directories(self): + """Get the unique list of directories for the client, control, and server config files. Returns: - str: the default systemd LD_LIBRARY_PATH + list: a list of directories for the client, control, and server config files """ - return None + directories = set() + directories.add(os.path.dirname(self.agent_config)) + directories.add(os.path.dirname(self.control_config)) + directories.add(os.path.dirname(self.server_config)) + return list(directories) def set_test_environment(logger, test_env=None, servers=None, clients=None, provider=None, diff --git a/src/tests/ftest/util/fault_config_utils.py b/src/tests/ftest/util/fault_config_utils.py index b6d437c2e29..5dd3071271f 100644 --- a/src/tests/ftest/util/fault_config_utils.py +++ b/src/tests/ftest/util/fault_config_utils.py @@ -8,7 +8,7 @@ import os import yaml -from general_utils import distribute_files +from file_utils import distribute_files from run_utils import run_local, run_remote # a lookup table of predefined faults @@ -295,15 +295,22 @@ def start(self, fault_list, test_dir): # orterun or something, could re-evaluate this later self.write_fault_file(None) - def copy_fault_files(self, hosts): + def copy_fault_files(self, logger, hosts): """Copy the fault injection file to all test hosts. 
Args: + logger (Logger): logger for the messages produced by this method hosts (list): list of hosts to copy the fault injection file + + Raises: + FaultInjectionFailed: if there is an error copying the fault injection files """ if self._fault_list: self._hosts = hosts - distribute_files(self._hosts, self.fault_file, self.fault_file) + result = distribute_files(logger, self._hosts, self.fault_file, self.fault_file) + if not result.passed: + raise FaultInjectionFailed( + f"Error copying fault injection files to {result.failed_hosts}") def stop(self): """Remove the fault injection file created during testing. diff --git a/src/tests/ftest/util/file_utils.py b/src/tests/ftest/util/file_utils.py new file mode 100644 index 00000000000..13f7e45a382 --- /dev/null +++ b/src/tests/ftest/util/file_utils.py @@ -0,0 +1,147 @@ +""" + (C) Copyright 2018-2024 Intel Corporation. + + SPDX-License-Identifier: BSD-2-Clause-Patent +""" + +import os + +# pylint: disable=import-error,no-name-in-module +from util.host_utils import get_local_host +from util.run_utils import command_as_user, get_clush_command, run_local, run_remote +from util.user_utils import get_chown_command, get_primary_group + + +def __run_command(logger, hosts, command, verbose=True, timeout=15): + """Run the command locally if there are no remote hosts or remotely. + + Args: + logger (Logger): logger for the messages produced by this method + hosts (NodeSet): hosts on which to run the command + command (str): command to run + verbose (bool, optional): log the command output. Defaults to True. + timeout (int, optional): command timeout. Defaults to 15 seconds. 
+ + Returns: + CommandResult: groups of command results from the same hosts with the same return status + """ + if not hosts.difference(get_local_host()): + return run_local(logger, command, verbose, timeout) + return run_remote(logger, hosts, command, verbose, timeout) + + +def create_directory(logger, hosts, directory, timeout=15, verbose=True, user=None): + """Create the specified directory on the specified hosts. + + Args: + logger (Logger): logger for the messages produced by this method + hosts (NodeSet): hosts on which to create the directory + directory (str): the directory to create + timeout (int, optional): command timeout. Defaults to 15 seconds. + verbose (bool, optional): log the command output. Defaults to True. + user (str, optional): user with which to run the command. Defaults to None. + + Returns: + CommandResult: groups of command results from the same hosts with the same return status + """ + command = command_as_user(f"mkdir -p {directory}", user) + return __run_command(logger, hosts, command, verbose, timeout) + + +def change_file_owner(logger, hosts, filename, owner, group, timeout=15, verbose=True, user=None): + """Create the specified directory on the specified hosts. + + Args: + logger (Logger): logger for the messages produced by this method + hosts (NodeSet): hosts on which to create the directory + filename (str): the file for which to change ownership + owner (str): new owner of the file + group (str): new group owner of the file + timeout (int, optional): command timeout. Defaults to 15 seconds. + verbose (bool, optional): log the command output. Defaults to True. + user (str, optional): user with which to run the command. Defaults to None. 
+
+    Returns:
+        CommandResult: groups of command results from the same hosts with the same return status
+    """
+    command = command_as_user(get_chown_command(owner, group, file=filename), user)
+    return __run_command(logger, hosts, command, verbose, timeout)
+
+
+def get_file_size(logger, host, file_name):
+    """Obtain the file size on the specified host.
+
+    Args:
+        logger (Logger): logger for the messages produced by this method
+        host (NodeSet): host from which to get the file size
+        file_name (str): name of remote file
+
+    Returns:
+        CommandResult: groups of command results from the same hosts with the same return status
+    """
+    return __run_command(logger, host, f"stat -c%s {file_name}")
+
+
+def distribute_files(logger, hosts, source, destination, mkdir=True, timeout=60,
+                     verbose=True, sudo=False, owner=None):
+    """Copy the source to the destination on each of the specified hosts.
+
+    Optionally (by default) ensure the destination directory exists on each of
+    the specified hosts prior to copying the source.
+
+    Args:
+        logger (Logger): logger for the messages produced by this method
+        hosts (NodeSet): hosts on which to copy the source
+        source (str): the file to copy to the hosts
+        destination (str): the host location in which to copy the source
+        mkdir (bool, optional): whether or not to ensure the destination
+            directory exists on hosts prior to copying the source. Defaults to
+            True.
+        timeout (int, optional): command timeout. Defaults to 60 seconds.
+        verbose (bool, optional): whether to log the command run and
+            stdout/stderr. Defaults to True.
+        sudo (bool, optional): whether to copy the file as root; the source is
+            first copied to each remote host as the current user and then
+            copied locally into the destination there via sudo. Defaults to
+            False.
+        owner (str, optional): if specified the owner to assign as the owner of
+            the copied file. Defaults to None.
+ + Returns: + CommandResult: groups of command results from the same hosts with the same return status + """ + result = None + if mkdir: + result = create_directory(logger, hosts, os.path.dirname(destination), timeout, verbose) + + if result is None or result.passed: + if sudo: + # In order to copy a protected file to a remote host in CI the source will first be + # copied as is to the remote host + other_hosts = hosts.difference(get_local_host()) + if other_hosts: + # Existing files with strict file permissions can cause the subsequent non-sudo + # copy to fail, so remove the file first + _rm_command = command_as_user(f"rm -f {source}", "root") + run_remote(logger, other_hosts, _rm_command, verbose, timeout) + result = distribute_files( + logger, other_hosts, source, source, mkdir=True, + timeout=timeout, verbose=verbose, sudo=False, owner=None) + + if result is None or result.passed: + # Then a local sudo copy will be executed on the remote node to copy the source + # to the destination + _cp_command = command_as_user(f"cp {source} {destination}", "root") + result = run_remote(logger, hosts, _cp_command, verbose, timeout) + else: + # Without the sudo requirement copy the source to the destination directly with clush + _command = get_clush_command(hosts, args=f"-S -v --copy {source} --dest {destination}") + result = run_local(logger, _command, verbose, timeout) + + # If requested update the ownership of the destination file + if owner is not None and result.passed: + result = change_file_owner( + logger, hosts, destination, owner, get_primary_group(owner), timeout, verbose, + "root" if sudo else None) + + return result diff --git a/src/tests/ftest/util/general_utils.py b/src/tests/ftest/util/general_utils.py index 7ef3b73572d..84e55601ff2 100644 --- a/src/tests/ftest/util/general_utils.py +++ b/src/tests/ftest/util/general_utils.py @@ -16,15 +16,13 @@ from getpass import getuser from importlib import import_module from logging import getLogger -from socket import 
gethostname from avocado.core.settings import settings from avocado.core.version import MAJOR from avocado.utils import process from ClusterShell.NodeSet import NodeSet from ClusterShell.Task import task_self -from run_utils import command_as_user, get_clush_command, run_local, run_remote -from user_utils import get_chown_command, get_primary_group +from run_utils import command_as_user, run_local, run_remote class DaosTestError(Exception): @@ -891,177 +889,20 @@ def convert_string(item, separator=","): return item -def create_directory(hosts, directory, timeout=15, verbose=True, - raise_exception=True, sudo=False): - """Create the specified directory on the specified hosts. - - Args: - hosts (NodeSet): hosts on which to create the directory - directory (str): the directory to create - timeout (int, optional): command timeout. Defaults to 15 seconds. - verbose (bool, optional): whether to log the command run and - stdout/stderr. Defaults to True. - raise_exception (bool, optional): whether to raise an exception if the - command returns a non-zero exit status. Defaults to True. - sudo (bool, optional): whether to run the command via sudo. Defaults to - False. - - Raises: - DaosTestError: if there is an error running the command - - Returns: - CmdResult: an avocado.utils.process CmdResult object containing the - result of the command execution. 
A CmdResult object has the - following properties: - command - command string - exit_status - exit_status of the command - stdout - the stdout - stderr - the stderr - duration - command execution time - interrupted - whether the command completed within timeout - pid - command's pid - - """ - mkdir_command = "/usr/bin/mkdir -p {}".format(directory) - command = get_clush_command(hosts, args="-S -v", command=mkdir_command, command_sudo=sudo) - return run_command(command, timeout=timeout, verbose=verbose, raise_exception=raise_exception) - - -def change_file_owner(hosts, filename, owner, group, timeout=15, verbose=True, - raise_exception=True, sudo=False): - """Create the specified directory on the specified hosts. - - Args: - hosts (NodeSet): hosts on which to create the directory - filename (str): the file for which to change ownership - owner (str): new owner of the file - group (str): new group owner of the file - timeout (int, optional): command timeout. Defaults to 15 seconds. - verbose (bool, optional): whether to log the command run and - stdout/stderr. Defaults to True. - raise_exception (bool, optional): whether to raise an exception if the - command returns a non-zero exit status. Defaults to True. - sudo (bool, optional): whether to run the command via sudo. Defaults to - False. - - Raises: - DaosTestError: if there is an error running the command - - Returns: - CmdResult: an avocado.utils.process CmdResult object containing the - result of the command execution. 
A CmdResult object has the - following properties: - command - command string - exit_status - exit_status of the command - stdout - the stdout - stderr - the stderr - duration - command execution time - interrupted - whether the command completed within timeout - pid - command's pid - - """ - chown_command = get_chown_command(owner, group, file=filename) - command = get_clush_command(hosts, args="-S -v", command=chown_command, command_sudo=sudo) - return run_command(command, timeout=timeout, verbose=verbose, raise_exception=raise_exception) - - -def distribute_files(hosts, source, destination, mkdir=True, timeout=60, - verbose=True, raise_exception=True, sudo=False, - owner=None): - """Copy the source to the destination on each of the specified hosts. - - Optionally (by default) ensure the destination directory exists on each of - the specified hosts prior to copying the source. - - Args: - hosts (NodeSet): hosts on which to copy the source - source (str): the file to copy to the hosts - destination (str): the host location in which to copy the source - mkdir (bool, optional): whether or not to ensure the destination - directory exists on hosts prior to copying the source. Defaults to - True. - timeout (int, optional): command timeout. Defaults to 60 seconds. - verbose (bool, optional): whether to log the command run and - stdout/stderr. Defaults to True. - raise_exception (bool, optional): whether to raise an exception if the - command returns a non-zero exit status. Defaults to True. - sudo (bool, optional): whether to run the command via sudo. Defaults to - False. - owner (str, optional): if specified the owner to assign as the owner of - the copied file. Defaults to None. - - Raises: - DaosTestError: if there is an error running the command - - Returns: - CmdResult: an avocado.utils.process CmdResult object containing the - result of the command execution. 
A CmdResult object has the - following properties: - command - command string - exit_status - exit_status of the command - stdout - the stdout - stderr - the stderr - duration - command execution time - interrupted - whether the command completed within timeout - pid - command's pid - - """ - result = None - if mkdir: - result = create_directory( - hosts, os.path.dirname(destination), verbose=verbose, - raise_exception=raise_exception) - if result is None or result.exit_status == 0: - if sudo: - # In order to copy a protected file to a remote host in CI the - # source will first be copied as is to the remote host - localhost = gethostname().split(".")[0] - other_hosts = NodeSet.fromlist([host for host in hosts if host != localhost]) - if other_hosts: - # Existing files with strict file permissions can cause the - # subsequent non-sudo copy to fail, so remove the file first - rm_command = get_clush_command( - other_hosts, args="-S -v", command="rm -f {}".format(source), - command_sudo=True) - run_command(rm_command, verbose=verbose, raise_exception=False) - result = distribute_files( - other_hosts, source, source, mkdir=True, - timeout=timeout, verbose=verbose, - raise_exception=raise_exception, sudo=False, owner=None) - if result is None or result.exit_status == 0: - # Then a local sudo copy will be executed on the remote node to - # copy the source to the destination - command = get_clush_command( - hosts, args="-S -v", command="cp {} {}".format(source, destination), - command_sudo=True) - result = run_command(command, timeout, verbose, raise_exception) - else: - # Without the sudo requirement copy the source to the destination - # directly with clush - command = get_clush_command( - hosts, args="-S -v --copy {} --dest {}".format(source, destination)) - result = run_command(command, timeout, verbose, raise_exception) - - # If requested update the ownership of the destination file - if owner is not None and result.exit_status == 0: - change_file_owner( - hosts, 
destination, owner, get_primary_group(owner), timeout=timeout, - verbose=verbose, raise_exception=raise_exception, sudo=sudo) - return result - - -def get_default_config_file(name): +def get_default_config_file(name, path=None): """Get the default config file. Args: name (str): daos component name, e.g. server, agent, control + path (str, optional): path to use for the config file. Defaults to None which will use the + /etc/daos default. Returns: str: the default config file - """ - file_name = "".join(["daos_", name, ".yml"]) - return os.path.join(os.sep, "etc", "daos", file_name) + if path is None: + path = os.path.join(os.sep, "etc", "daos") + return os.path.join(path, f"daos_{name}.yml") def get_file_listing(hosts, files, user): diff --git a/src/tests/ftest/util/launch_utils.py b/src/tests/ftest/util/launch_utils.py index a86df01ea55..0f7284c50ef 100644 --- a/src/tests/ftest/util/launch_utils.py +++ b/src/tests/ftest/util/launch_utils.py @@ -20,9 +20,10 @@ from util.host_utils import HostException, HostInfo, get_local_host, get_node_set from util.logger_utils import LOG_FILE_FORMAT, get_file_handler from util.results_utils import LaunchTestName -from util.run_utils import RunException, run_local, run_remote +from util.run_utils import RunException, command_as_user, run_local, run_remote from util.slurm_utils import create_partition, delete_partition, show_partition from util.storage_utils import StorageException, StorageInfo +from util.systemctl_utils import SystemctlFailure, create_override_config from util.user_utils import get_group_id, get_user_groups, groupadd, useradd, userdel from util.yaml_utils import YamlUpdater, get_yaml_data @@ -77,6 +78,66 @@ def setup_fuse_config(logger, hosts): raise LaunchException(f"Failed to setup {config}") +def __add_systemctl_override(logger, hosts, service, user, command, config, path, lib_path): + """Add a systemctl override file for the specified service. 
+
+    Args:
+        logger (Logger): logger for the messages produced by this method
+        hosts (NodeSet): hosts on which to create the systemctl config
+        service (str): service for which to issue the command
+        user (str): user to use to issue the command
+        command (str): full path to the service command
+        config (str): full path to the service config
+        path (str): the PATH variable to set in the systemd config.
+        lib_path (str): the LD_LIBRARY_PATH variable to set in the systemd config.
+
+    Raises:
+        LaunchException: if setup fails
+
+    Returns:
+        dict: a dictionary of the systemctl override config file key with a dictionary value
+            containing the related host and user information
+    """
+    logger.debug("-" * 80)
+    logger.info("Setting up systemctl override for %s", service)
+    try:
+        systemctl_override = create_override_config(
+            logger, hosts, service, user, command, config, path, lib_path)
+    except SystemctlFailure as error:
+        raise LaunchException(f"Failed to setup systemctl config for {service}") from error
+    return {systemctl_override: {"hosts": hosts, "user": user}}
+
+
+def setup_systemctl(logger, servers, clients, test_env):
+    """Set up the systemctl override files for the daos_server and daos_agent.
+ + Args: + logger (Logger): logger for the messages produced by this method + servers (NodeSet): hosts that may run the daos_server command + clients (NodeSet): hosts that may run the daos_agent command + test_env (TestEnvironment): the test environment + + Raises: + LaunchException: if setup fails + + Returns: + dict: a dictionary of systemctl override config file keys with NodeSet values identifying + the hosts on which to remove the config files at the end of testing + """ + systemctl_configs = {} + systemctl_configs.update( + __add_systemctl_override( + logger, servers, "daos_server.service", "root", + os.path.join(test_env.daos_prefix, "bin", "daos_server"), test_env.server_config, + None, None)) + systemctl_configs.update( + __add_systemctl_override( + logger, clients, "daos_agent.service", test_env.agent_user, + os.path.join(test_env.daos_prefix, "bin", "daos_agent"), test_env.agent_config, + None, None)) + return systemctl_configs + + def display_disk_space(logger, path): """Display disk space of provided path destination. 
@@ -544,12 +605,13 @@ def _setup_test_directory(self, logger, test): f"sudo -n rm -fr {test_env.log_dir}", f"mkdir -p {test_env.log_dir}", f"chmod a+wrx {test_env.log_dir}", - f"ls -al {test_env.log_dir}", - f"mkdir -p {test_env.user_dir}" ] # Predefine the sub directories used to collect the files process()/_archive_files() + directories = [test_env.user_dir] + test_env.config_file_directories() for directory in TEST_RESULTS_DIRS: - commands.append(f"mkdir -p {test_env.log_dir}/{directory}") + directories.append(os.path.join(test_env.log_dir, directory)) + commands.append(f"mkdir -p {' '.join(directories)}") + commands.append(f"ls -al {test_env.log_dir}") for command in commands: if not run_remote(logger, hosts, command).passed: message = "Error setting up the common test directory on all hosts" @@ -1161,7 +1223,7 @@ def _setup_application_directory(self, logger, result): def run_tests(self, logger, result, repeat, slurm_setup, sparse, fail_fast, stop_daos, archive, rename, jenkins_log, core_files, threshold, user_create, code_coverage, - job_results_dir, logdir, clear_mounts): + job_results_dir, logdir, clear_mounts, cleanup_files): # pylint: disable=too-many-arguments """Run all the tests. 
@@ -1183,6 +1245,7 @@ def run_tests(self, logger, result, repeat, slurm_setup, sparse, fail_fast, stop job_results_dir (str): avocado job-results directory logdir (str): base directory in which to place the log file clear_mounts (list): mount points to remove before each test + cleanup_files (dict): files to remove on specific hosts at the end of testing Returns: int: status code indicating any issues running tests @@ -1233,6 +1296,12 @@ def run_tests(self, logger, result, repeat, slurm_setup, sparse, fail_fast, stop # Stop logging to the test log file logger.removeHandler(test_file_handler) + # Cleanup any specified files at the end of testing + for file, info in cleanup_files.items(): + command = command_as_user(f"rm -fr {file}", info['user']) + if not run_remote(logger, info['hosts'], command).passed: + return_code |= 16 + # Collect code coverage files after all test have completed if not code_coverage.finalize(logger, job_results_dir, result.tests[0]): return_code |= 16 diff --git a/src/tests/ftest/util/server_utils.py b/src/tests/ftest/util/server_utils.py index d7e1efdd58a..6ae05af94e9 100644 --- a/src/tests/ftest/util/server_utils.py +++ b/src/tests/ftest/util/server_utils.py @@ -457,7 +457,8 @@ def support_collect_log(self, **kwargs): cmd = DaosServerCommand(self.manager.job.command_path) cmd.run_user = "daos_server" cmd.debug.value = False - kwargs['config'] = get_default_config_file("server") + kwargs['config'] = get_default_config_file( + "server", os.path.dirname(self.manager.job.yaml.filename)) cmd.set_command(("support", "collect-log"), **kwargs) self.log.info("Support collect-log on servers: %s", str(cmd)) return run_remote( diff --git a/src/tests/ftest/util/systemctl_utils.py b/src/tests/ftest/util/systemctl_utils.py new file mode 100644 index 00000000000..848b7dc6d31 --- /dev/null +++ b/src/tests/ftest/util/systemctl_utils.py @@ -0,0 +1,249 @@ +""" + (C) Copyright 2018-2024 Intel Corporation. 
+ + SPDX-License-Identifier: BSD-2-Clause-Patent +""" + +import getpass +import os +import tempfile + +from ClusterShell.NodeSet import NodeSet +# pylint: disable=import-error,no-name-in-module +from util.file_utils import create_directory, distribute_files +from util.run_utils import command_as_user, run_remote + + +class SystemctlFailure(Exception): + """Base exception for this module.""" + + +def get_service_status(logger, hosts, service, user="root"): + """Get the status of the daos_server.service. + + Args: + logger (Logger): logger for the messages produced by this method + hosts (NodeSet): hosts on which to get the service state + service (str): name of the service + user (str, optional): user to use to issue the command. Defaults to "root". + + Returns: + dict: a dictionary with the following keys: + - "status": boolean set to True if status was obtained; False otherwise + - "stop": NodeSet where to stop the daos_server.service + - "disable": NodeSet where to disable the daos_server.service + - "reset-failed": NodeSet where to reset the daos_server.service + + """ + status = { + "status": True, + "stop": NodeSet(), + "disable": NodeSet(), + "reset-failed": NodeSet()} + status_states = { + "stop": ["active", "activating", "deactivating"], + "disable": ["active", "activating", "deactivating"], + "reset-failed": ["failed"]} + command = get_systemctl_command("is-active", service, user) + result = run_remote(logger, hosts, command, False) + for data in result.output: + if data.timeout: + status["status"] = False + status["stop"].add(data.hosts) + status["disable"].add(data.hosts) + status["reset-failed"].add(data.hosts) + logger.debug(" %s: TIMEOUT", data.hosts) + break + logger.debug(" %s: %s", data.hosts, "\n".join(data.stdout)) + for key, state_list in status_states.items(): + for line in data.stdout: + if line in state_list: + status[key].add(data.hosts) + break + return status + + +def stop_service(logger, hosts, service, user="root"): + """Stop any 
daos_server.service running on the hosts running servers. + + Args: + logger (Logger): logger for the messages produced by this method + hosts (NodeSet): list of hosts on which to stop the service. + service (str): name of the service + user (str, optional): user to use to issue the command. Defaults to "root". + + Returns: + bool: True if the service was successfully stopped; False otherwise + + """ + if not hosts: + logger.debug(" Skipping stopping %s service - no hosts", service) + return True + + result = {"status": True} + status_keys = ["reset-failed", "stop", "disable"] + mapping = {"stop": "active", "disable": "enabled", "reset-failed": "failed"} + check_hosts = NodeSet(hosts) + loop = 1 + # Reduce 'max_loops' to 2 once https://jira.hpdd.intel.com/browse/DAOS-7809 has been resolved + max_loops = 3 + while check_hosts: + # Check the status of the service on each host + result = get_service_status(logger, check_hosts, service) + check_hosts = NodeSet() + for key in status_keys: + if result[key]: + if loop == max_loops: + # Exit the while loop if the service is still running + logger.error( + " - Error %s still %s on %s", service, mapping[key], result[key]) + result["status"] = False + else: + # Issue the appropriate systemctl command to remedy the + # detected state, e.g. 'stop' for 'active'. + command = command_as_user(get_systemctl_command(key, service, user), user) + run_remote(logger, result[key], command) + + # Run the status check again on this group of hosts + check_hosts.add(result[key]) + loop += 1 + + return result["status"] + + +def get_systemctl_command(unit_command, service, user="root"): + """Get the systemctl command for the specified inputs. + + Args: + unit_command (str): command to issue for the service + service (str): service for which to issue the command + user (str, optional): user to use to issue the command. Defaults to "root". 
+ + Returns: + str: the systemctl command for the specified service and user + """ + command = ["systemctl"] + if user != "root": + command.append(f"--user {user}") + if unit_command: + command.append(unit_command) + if service: + command.append(service) + return " ".join(command) + + +def get_service_file(logger, hosts, service, user, verbose=True, timeout=120): + """Get the service file. + + Args: + logger (Logger): logger for the messages produced by this method + hosts (NodeSet): hosts on which to run the command + service (str): service for which to issue the command + user (str, optional): user to use to issue the command. Defaults to "root". + verbose (bool, optional): log the command output. Defaults to True. + timeout (int, optional): number of seconds to wait for the command to complete. + Defaults to 120 seconds. + + Raises: + SystemctlFailure: if there is a problem obtaining the service file + + Returns: + str: the service file + """ + command = ' | '.join([ + get_systemctl_command("status", service, user), + "grep 'Loaded:'", + "grep -oE '/.*service'", + "xargs sh -c '[ -e \"$0\" ] && echo \"$0\"'" + ]) + result = run_remote(logger, hosts, command, verbose, timeout) + if not result.passed: + raise SystemctlFailure("Error obtaining the service file path") + if not result.homogeneous: + raise SystemctlFailure("Error obtaining a homogeneous service file path") + return list(result.all_stdout.values())[0].strip() + + +def create_override_config(logger, hosts, service, user, service_command, service_config, path, + ld_library_path, verbose=True, timeout=120): + """Create a systemctl override config file. 
+ + Args: + logger (Logger): logger for the messages produced by this method + hosts (NodeSet): hosts on which to run the command + service (str): service for which to issue the command + user (str): user to use to issue the command + service_command (str): full path to the service command + service_config (str): full path to the service config + path (str): the PATH variable to set in the systemd config. + ld_library_path (str): the LD_LIBRARY_PATH variable to set in the systemd config. + verbose (bool, optional): log the command output. Defaults to True. + timeout (int, optional): number of seconds to wait for the command to complete. + Defaults to 120 seconds. + + Raises: + SystemctlFailure: if there are problems detecting, creating, or distributing the systemctl + override config file + + Returns: + str: the systemctl override config file path + """ + # Get the existing service file + service_file = get_service_file(logger, hosts, service, user, verbose, timeout) + + # Create the override directory + override_file = os.path.join(f"{service_file}.d", "override.conf") + result = create_directory(logger, hosts, os.path.dirname(override_file), timeout, verbose, user) + if not result.passed: + raise SystemctlFailure("Error creating the systemctl override config directory") + + # Create the override file - empty ExecStart clears the existing setting + override_contents = [ + "[Service]", + "ExecStart=", + f"ExecStart={service_command} start -o {service_config}" + ] + if path: + override_contents.append(f'Environment="PATH={path}"') + if ld_library_path: + override_contents.append(f'Environment="LD_LIBRARY_PATH={ld_library_path}"') + override_contents = "\n".join(override_contents) + "\n" + + with tempfile.NamedTemporaryFile() as temp: + temp.write(bytes(override_contents, encoding='utf-8')) + temp.flush() + os.chmod(temp.name, 0o644) + + _sudo = user != getpass.getuser() + _owner = user if _sudo else None + + result = distribute_files( + logger, hosts, temp.name, 
override_file, mkdir=False, verbose=verbose, sudo=_sudo,
+            owner=_owner)
+        if not result.passed:
+            raise SystemctlFailure(
+                "Error distributing the systemctl override config directory")
+
+    # Reload on all hosts to pick up changes
+    if not daemon_reload(logger, hosts, user, verbose, timeout).passed:
+        raise SystemctlFailure("Error reloading systemctl daemon with override config directory")
+
+    return override_file
+
+
+def daemon_reload(logger, hosts, user, verbose=True, timeout=120):
+    """Run systemctl daemon-reload.
+
+    Args:
+        logger (Logger): logger for the messages produced by this method
+        hosts (NodeSet): hosts on which to run the command
+        user (str): user to use to issue the command
+        verbose (bool, optional): log the command output. Defaults to True.
+        timeout (int, optional): number of seconds to wait for the command to complete.
+            Defaults to 120 seconds.
+
+    Returns:
+        CommandResult: groups of command results from the same hosts with the same return status
+    """
+    command = get_systemctl_command("daemon-reload", None, user)
+    return run_remote(logger, hosts, command_as_user(command, user), verbose, timeout)