DAOS-16366 test: Use agent/server config files from test directory (#14944) #15033

Merged
merged 4 commits into from Aug 30, 2024
11 changes: 8 additions & 3 deletions src/tests/ftest/daos_test/dfuse.py
@@ -10,7 +10,8 @@
from apricot import TestWithServers
from cmocka_utils import CmockaUtils
from dfuse_utils import get_dfuse, start_dfuse
from general_utils import create_directory, get_log_file
from file_utils import create_directory
from general_utils import get_log_file
from job_manager_utils import get_job_manager


@@ -80,7 +81,9 @@ def run_test(self, il_lib=None):
else:
# Bypass, simply create a remote directory and use that.
mount_dir = '/tmp/dfuse-test'
create_directory(self.hostlist_clients, mount_dir)
result = create_directory(self.log, self.hostlist_clients, mount_dir)
if not result.passed:
self.fail(f"Error creating {mount_dir} on {result.failed_hosts}")

cmocka_utils = CmockaUtils(
self.hostlist_clients, "dfuse", self.outputdir, self.test_dir, self.log)
@@ -118,7 +121,9 @@ def run_test(self, il_lib=None):
else:
# make D_IL_MOUNT_POINT different from mount_dir so it tests a non-DAOS filesystem
dummy_dir = '/tmp/dummy'
create_directory(self.hostlist_clients, dummy_dir)
result = create_directory(self.log, self.hostlist_clients, dummy_dir)
if not result.passed:
self.fail(f"Error creating {dummy_dir} on {result.failed_hosts}")
daos_test_env['D_IL_MOUNT_POINT'] = dummy_dir
if cache_mode != 'writeback':
command.append('--metadata')
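
A minimal sketch (not the actual DAOS file_utils implementation) of the result-checking pattern the hunks above adopt: the helper returns an object exposing passed and failed_hosts, and the caller decides whether to fail the test. The RemoteResult class and the local mkdir call are illustrative assumptions.

import logging
import subprocess
from dataclasses import dataclass, field


@dataclass
class RemoteResult:
    """Hypothetical stand-in for the object returned by file_utils.create_directory()."""
    passed: bool
    failed_hosts: list = field(default_factory=list)


def create_directory(log, hosts, directory):
    """Create a directory for each host; simulated locally to keep the sketch runnable."""
    failed = []
    for host in hosts:
        # The real helper runs the command remotely (e.g. via clush); a local
        # mkdir -p stands in for it here.
        proc = subprocess.run(["mkdir", "-p", directory], capture_output=True, text=True)
        if proc.returncode != 0:
            log.error("mkdir on %s failed: %s", host, proc.stderr.strip())
            failed.append(host)
    return RemoteResult(passed=not failed, failed_hosts=failed)


if __name__ == "__main__":
    log = logging.getLogger("sketch")
    mount_dir = "/tmp/dfuse-test"
    result = create_directory(log, ["client-1", "client-2"], mount_dir)
    if not result.passed:
        raise RuntimeError(f"Error creating {mount_dir} on {result.failed_hosts}")
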
17 changes: 14 additions & 3 deletions src/tests/ftest/launch.py
@@ -21,7 +21,8 @@
from util.code_coverage_utils import CodeCoverage
from util.environment_utils import TestEnvironment, TestEnvironmentException, set_test_environment
from util.host_utils import get_local_host
from util.launch_utils import LaunchException, TestGroup, setup_fuse_config, summarize_run
from util.launch_utils import (LaunchException, TestGroup, setup_fuse_config, setup_systemctl,
summarize_run)
from util.logger_utils import LOG_FILE_FORMAT, get_console_handler, get_file_handler
from util.network_utils import PROVIDER_ALIAS, SUPPORTED_PROVIDERS
from util.package_utils import find_packages
@@ -270,7 +271,8 @@ def _run(self, args):
# pylint: disable=unsupported-binary-operation
all_hosts = args.test_servers | args.test_clients | self.local_host
self.details["installed packages"] = find_packages(
logger, all_hosts, "'^(daos|libfabric|mercury|ior|openmpi|mpifileutils)-'")
logger, all_hosts,
"'^(daos|libfabric|mercury|ior|openmpi|mpifileutils|mlnx-ofed-basic)-'")

# Setup the test environment
test_env = TestEnvironment()
@@ -325,6 +327,15 @@ def _run(self, args):
message = "Issue detected setting up the fuse configuration"
setup_result.warn_test(logger, "Setup", message, sys.exc_info())

# Setup override systemctl files
try:
clients = args.test_clients if args.test_clients else args.test_servers
cleanup_files = setup_systemctl(
logger, args.test_servers, clients | self.local_host, test_env)
except LaunchException:
message = "Issue detected setting up the systemctl configuration"
return self.get_exit_status(1, message, "Setup", sys.exc_info())

# Get the core file pattern information
core_files = {}
if args.process_cores:
@@ -370,7 +381,7 @@ def _run(self, args):
logger, self.result, self.repeat, self.slurm_setup, args.sparse, args.failfast,
not args.disable_stop_daos, args.archive, args.rename, args.jenkinslog, core_files,
args.logs_threshold, args.user_create, code_coverage, self.job_results_dir,
self.logdir, args.clear_mounts)
self.logdir, args.clear_mounts, cleanup_files)

# Convert the test status to a launch.py status
status |= summarize_run(logger, self.mode, test_status)
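
A hedged sketch of the setup/cleanup hand-off introduced in launch.py above: the systemctl setup step creates override files and returns their paths so the later run and cleanup stages know what to remove. The helper body, file locations, and host names here are assumptions; only the fallback from test clients to test servers mirrors the diff.

import logging
import tempfile


def setup_systemctl_overrides(logger, servers, clients):
    """Create one override file per host and return the paths for later cleanup."""
    cleanup_files = []
    for host in sorted(set(servers) | set(clients)):
        handle = tempfile.NamedTemporaryFile(
            prefix=f"{host}_systemd_override_", suffix=".conf", delete=False)
        handle.close()
        logger.debug("Created override file %s for %s", handle.name, host)
        cleanup_files.append(handle.name)
    return cleanup_files


logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger("sketch")
test_servers = {"server-1"}
test_clients = set()  # empty when no test clients were requested
clients = test_clients if test_clients else test_servers  # same fallback as in _run()
cleanup_files = setup_systemctl_overrides(log, test_servers, clients)
log.info("Files to remove after the run: %s", cleanup_files)
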
7 changes: 4 additions & 3 deletions src/tests/ftest/pool/destroy.py
@@ -362,10 +362,11 @@ def test_destroy_wrong_group(self):
server_group_b = self.server_group + "_b"

# Prepare and configure dmg config files for a and b.
dmg_config_file_a = get_default_config_file(name="control_a")
config_path = os.path.dirname(self.test_env.control_config)
dmg_config_file_a = get_default_config_file(name="control_a", path=config_path)
dmg_config_temp_a = self.get_config_file(
name=server_group_a, command="dmg", path=self.test_dir)
dmg_config_file_b = get_default_config_file(name="control_b")
dmg_config_file_b = get_default_config_file(name="control_b", path=config_path)
dmg_config_temp_b = self.get_config_file(
name=server_group_b, command="dmg", path=self.test_dir)

@@ -393,7 +394,7 @@ def test_destroy_wrong_group(self):

# Get dmg_c instance that uses daos_control_c.yml. Server group is b.
cert_dir = os.path.join(os.sep, "etc", "daos", "certs")
dmg_config_file_c = get_default_config_file(name="control_c")
dmg_config_file_c = get_default_config_file(name="control_c", path=config_path)
dmg_config_temp_c = self.get_config_file(
name=server_group_b, command="dmg", path=self.test_dir)
dmg_c = get_dmg_command(
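
A small sketch of the path handling these hunks rely on; the daos_<name>.yml naming and the example directory are assumptions inferred from the surrounding code, not the verified general_utils.get_default_config_file() implementation.

import os


def get_default_config_file(name, path=os.path.join(os.sep, "etc", "daos")):
    """Assumed behaviour: build '<path>/daos_<name>.yml' for the given component."""
    return os.path.join(path, f"daos_{name}.yml")


# Mirror the test: derive the directory from the control config that already lives
# in the test directory, so the per-group dmg configs are written next to it.
control_config = "/var/tmp/daos_testing/configs/daos_control.yml"  # example path only
config_path = os.path.dirname(control_config)
print(get_default_config_file(name="control_a", path=config_path))
# -> /var/tmp/daos_testing/configs/daos_control_a.yml
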
17 changes: 7 additions & 10 deletions src/tests/ftest/recovery/ddb.py
@@ -10,10 +10,12 @@
from ClusterShell.NodeSet import NodeSet
from ddb_utils import DdbCommand
from exception_utils import CommandFailure
from general_utils import (DaosTestError, create_string_buffer, distribute_files,
get_clush_command, get_random_string, report_errors, run_command)
from file_utils import distribute_files
from general_utils import (DaosTestError, create_string_buffer, get_random_string, report_errors,
run_command)
from pydaos.raw import DaosObjClass, IORequest
from recovery_test_base import RecoveryTestBase
from run_utils import get_clush_command


def insert_objects(context, container, object_count, dkey_count, akey_count, base_dkey,
@@ -507,14 +509,9 @@ def test_recovery_ddb_load(self):
file.write(new_data)

# Copy the created file to server node.
try:
distribute_files(
hosts=host, source=load_file_path, destination=load_file_path,
mkdir=False)
except DaosTestError as error:
raise CommandFailure(
"ERROR: Copying new_data.txt to {0}: {1}".format(host, error)) \
from error
result = distribute_files(self.log, host, load_file_path, load_file_path, False)
if not result.passed:
raise CommandFailure(f"ERROR: Copying new_data.txt to {result.failed_hosts}")

# The file with the new data is ready. Run ddb load.
ddb_command.value_load(component_path="[0]/[0]/[0]/[0]", load_file_path=load_file_path)
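
Under the same assumptions as the sketch for dfuse.py, a compact illustration of replacing the exception-raising copy helper with one that returns a result object, keeping the CommandFailure raise on the caller side; the CopyResult class and the local shutil.copy are stand-ins for the real remote transfer.

import logging
import shutil
from dataclasses import dataclass, field


class CommandFailure(Exception):
    """Stand-in for exception_utils.CommandFailure."""


@dataclass
class CopyResult:
    passed: bool
    failed_hosts: list = field(default_factory=list)


def distribute_files(log, hosts, source, destination, mkdir=True):
    """Copy a file once per host; a local copy stands in for the remote transfer."""
    failed = []
    for host in hosts:
        try:
            shutil.copy(source, f"{destination}.{host}")
        except OSError as error:
            log.error("copy to %s failed: %s", host, error)
            failed.append(host)
    return CopyResult(passed=not failed, failed_hosts=failed)


log = logging.getLogger("sketch")
load_file_path = "/tmp/new_data.txt"
with open(load_file_path, "w", encoding="utf-8") as handle:
    handle.write("new data\n")
result = distribute_files(log, ["server-1"], load_file_path, load_file_path, False)
if not result.passed:
    raise CommandFailure(f"ERROR: Copying new_data.txt to {result.failed_hosts}")
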
6 changes: 3 additions & 3 deletions src/tests/ftest/util/agent_utils.py
@@ -6,7 +6,6 @@
import os
import re
import socket
from getpass import getuser

from agent_utils_params import DaosAgentTransportCredentials, DaosAgentYamlParameters
from ClusterShell.NodeSet import NodeSet
@@ -289,7 +288,7 @@ def start(self):

# Verify the socket directory exists when using a non-systemctl manager
if self.verify_socket_dir:
self.verify_socket_directory(getuser())
self.verify_socket_directory(self.manager.job.certificate_owner)

super().start()

@@ -319,7 +318,8 @@ def support_collect_log(self, **kwargs):
"""
cmd = self.manager.job.copy()
cmd.debug.value = False
cmd.config.value = get_default_config_file("agent")
cmd.config.value = get_default_config_file(
"agent", os.path.dirname(self.manager.job.yaml.filename))
cmd.set_command(("support", "collect-log"), **kwargs)
self.log.info("Support collect-log on clients: %s", str(cmd))
return run_remote(self.log, self.hosts, cmd.with_exports)
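
A brief sketch of the lookup change in support_collect_log(): the config handed to the support collect-log subcommand now comes from the directory of the yaml the agent was started with rather than /etc/daos. The paths and the daos_<name>.yml naming are illustrative assumptions.

import os

# The agent manager was started with a yaml in the per-test config directory;
# support collect-log reuses that directory instead of the system default.
agent_yaml_filename = "/var/tmp/daos_testing/configs/daos_agent.yml"  # example path only
config_dir = os.path.dirname(agent_yaml_filename)
collect_log_config = os.path.join(config_dir, "daos_agent.yml")  # assumed daos_<name>.yml naming
print("support collect-log config:", collect_log_config)
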
21 changes: 12 additions & 9 deletions src/tests/ftest/util/apricot/apricot/test.py
@@ -26,8 +26,7 @@
from exception_utils import CommandFailure
from fault_config_utils import FaultInjection
from general_utils import (dict_to_str, dump_engines_stacks, get_avocado_config_value,
get_default_config_file, get_file_listing, nodeset_append_suffix,
set_avocado_config_value)
nodeset_append_suffix, set_avocado_config_value)
from host_utils import HostException, HostInfo, HostRole, get_host_parameters, get_local_host
from logger_utils import TestLogger
from pydaos.raw import DaosApiError, DaosContext, DaosLog
@@ -762,13 +761,17 @@ def setUp(self):
self.fail(f"Error creating test-specific temporary directory on {result.failed_hosts}")

# Copy the fault injection files to the hosts.
self.fault_injection.copy_fault_files(self.host_info.all_hosts)
self.fault_injection.copy_fault_files(self.log, self.host_info.all_hosts)

# List common test directory contents before running the test
self.log.info("-" * 100)
self.log.debug("Common test directory (%s) contents:", self.test_dir)
self.log.debug("Common test directory (%s) contents:", os.path.dirname(self.test_dir))
all_hosts = include_local_host(self.host_info.all_hosts)
get_file_listing(all_hosts, self.test_dir, self.test_env.agent_user).log_output(self.log)
test_dir_parent = os.path.dirname(self.test_dir)
result = run_remote(self.log, all_hosts, f"df -h {test_dir_parent}")
if int(max(re.findall(r" ([\d+])% ", result.joined_stdout) + ["0"])) > 90:
run_remote(self.log, all_hosts, f"du -sh {test_dir_parent}/*")
self.log.info("-" * 100)

if not self.start_servers_once or self.name.uid == 1:
# Kill commands left running on the hosts (from a previous test)
@@ -1063,7 +1066,7 @@ def add_agent_manager(self, group=None, config_file=None, config_temp=None):
if group is None:
group = self.server_group
if config_file is None and self.agent_manager_class == "Systemctl":
config_file = get_default_config_file("agent")
config_file = self.test_env.agent_config
config_temp = self.get_config_file(group, "agent", self.test_dir)
elif config_file is None:
config_file = self.get_config_file(group, "agent")
@@ -1113,14 +1116,14 @@ def add_server_manager(self, group=None, svr_config_file=None,
if group is None:
group = self.server_group
if svr_config_file is None and self.server_manager_class == "Systemctl":
svr_config_file = get_default_config_file("server")
svr_config_file = self.test_env.server_config
svr_config_temp = self.get_config_file(
group, "server", self.test_dir)
elif svr_config_file is None:
svr_config_file = self.get_config_file(group, "server")
svr_config_temp = None
if dmg_config_file is None and self.server_manager_class == "Systemctl":
dmg_config_file = get_default_config_file("control")
dmg_config_file = self.test_env.control_config
dmg_config_temp = self.get_config_file(group, "dmg", self.test_dir)
elif dmg_config_file is None:
dmg_config_file = self.get_config_file(group, "dmg")
@@ -1669,7 +1672,7 @@ def get_dmg_command(self, index=0):
return self.server_managers[index].dmg

if self.server_manager_class == "Systemctl":
dmg_config_file = get_default_config_file("control")
dmg_config_file = self.test_env.control_config
dmg_config_temp = self.get_config_file("daos", "dmg", self.test_dir)
dmg_cert_dir = os.path.join(os.sep, "etc", "daos", "certs")
else:
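
A standalone sketch of the disk-usage guard added to setUp() above, run locally instead of through run_remote(); the path is an example and the percentage parsing is a simplified variant, not necessarily the harness's exact expression.

import re
import subprocess

test_dir_parent = "/var/tmp/daos_testing"  # example path only
result = subprocess.run(
    ["df", "-h", test_dir_parent], capture_output=True, text=True, check=False)
# Pull the "Use%" column out of the human-readable df output; default to 0 when
# nothing matches (for example when the path does not exist on this machine).
usage = max(int(value) for value in re.findall(r"(\d+)%", result.stdout) + ["0"])
if usage > 90:
    # Show which subdirectories are consuming the space before the test starts.
    subprocess.run(f"du -sh {test_dir_parent}/*", shell=True, check=False)
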
109 changes: 10 additions & 99 deletions src/tests/ftest/util/collection_utils.py
@@ -17,6 +17,7 @@
from util.environment_utils import TestEnvironment
from util.host_utils import get_local_host
from util.run_utils import find_command, run_local, run_remote, stop_processes
from util.systemctl_utils import stop_service
from util.user_utils import get_chown_command
from util.yaml_utils import get_test_category

@@ -66,97 +67,6 @@ def stop_daos_server_service(logger, test):
return stop_service(logger, hosts, service)


def stop_service(logger, hosts, service):
"""Stop any daos_server.service running on the hosts running servers.

Args:
logger (Logger): logger for the messages produced by this method
hosts (NodeSet): list of hosts on which to stop the service.
service (str): name of the service

Returns:
bool: True if the service was successfully stopped; False otherwise

"""
result = {"status": True}
if hosts:
status_keys = ["reset-failed", "stop", "disable"]
mapping = {"stop": "active", "disable": "enabled", "reset-failed": "failed"}
check_hosts = NodeSet(hosts)
loop = 1
# Reduce 'max_loops' to 2 once https://jira.hpdd.intel.com/browse/DAOS-7809
# has been resolved
max_loops = 3
while check_hosts:
# Check the status of the service on each host
result = get_service_status(logger, check_hosts, service)
check_hosts = NodeSet()
for key in status_keys:
if result[key]:
if loop == max_loops:
# Exit the while loop if the service is still running
logger.error(
" - Error %s still %s on %s", service, mapping[key], result[key])
result["status"] = False
else:
# Issue the appropriate systemctl command to remedy the
# detected state, e.g. 'stop' for 'active'.
command = ["sudo", "-n", "systemctl", key, service]
run_remote(logger, result[key], " ".join(command))

# Run the status check again on this group of hosts
check_hosts.add(result[key])
loop += 1
else:
logger.debug(" Skipping stopping %s service - no hosts", service)

return result["status"]


def get_service_status(logger, hosts, service):
"""Get the status of the daos_server.service.

Args:
logger (Logger): logger for the messages produced by this method
hosts (NodeSet): hosts on which to get the service state
service (str): name of the service

Returns:
dict: a dictionary with the following keys:
- "status": boolean set to True if status was obtained; False otherwise
- "stop": NodeSet where to stop the daos_server.service
- "disable": NodeSet where to disable the daos_server.service
- "reset-failed": NodeSet where to reset the daos_server.service

"""
status = {
"status": True,
"stop": NodeSet(),
"disable": NodeSet(),
"reset-failed": NodeSet()}
status_states = {
"stop": ["active", "activating", "deactivating"],
"disable": ["active", "activating", "deactivating"],
"reset-failed": ["failed"]}
command = ["systemctl", "is-active", service]
result = run_remote(logger, hosts, " ".join(command))
for data in result.output:
if data.timeout:
status["status"] = False
status["stop"].add(data.hosts)
status["disable"].add(data.hosts)
status["reset-failed"].add(data.hosts)
logger.debug(" %s: TIMEOUT", data.hosts)
break
logger.debug(" %s: %s", data.hosts, "\n".join(data.stdout))
for key, state_list in status_states.items():
for line in data.stdout:
if line in state_list:
status[key].add(data.hosts)
break
return status


def reset_server_storage(logger, test):
"""Reset the server storage for the hosts that ran servers in the test.

@@ -981,14 +891,15 @@ def collect_test_result(logger, test, test_result, job_results_dir, stop_daos, a
"depth": 1,
"timeout": 300,
}
remote_files["remote configuration files"] = {
"source": os.path.join(os.sep, "etc", "daos"),
"destination": os.path.join(job_results_dir, "latest", TEST_RESULTS_DIRS[0]),
"pattern": "daos_*.yml",
"hosts": test.host_info.all_hosts,
"depth": 1,
"timeout": 300,
}
for index, source in enumerate(test_env.config_file_directories()):
remote_files[f"remote configuration files ({index})"] = {
"source": source,
"destination": os.path.join(job_results_dir, "latest", TEST_RESULTS_DIRS[0]),
"pattern": "daos_*.yml",
"hosts": test.host_info.all_hosts,
"depth": 1,
"timeout": 300,
}
remote_files["daos log files"] = {
"source": test_env.log_dir,
"destination": os.path.join(job_results_dir, "latest", TEST_RESULTS_DIRS[1]),
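
A minimal sketch of the dictionary-building loop above, assuming TestEnvironment.config_file_directories() returns the list of directories that hold the agent/server/control yaml files for the test; all paths and host names are illustrative.

import os

# Assumed return value of test_env.config_file_directories() for this sketch.
config_file_directories = ["/var/tmp/daos_testing/configs", os.path.join(os.sep, "etc", "daos")]
job_results_dir = "/var/tmp/ftest/job_results"  # example only
test_results_subdir = "configs"                 # stand-in for TEST_RESULTS_DIRS[0]
all_hosts = ["server-1", "client-1"]

remote_files = {}
for index, source in enumerate(config_file_directories):
    remote_files[f"remote configuration files ({index})"] = {
        "source": source,
        "destination": os.path.join(job_results_dir, "latest", test_results_subdir),
        "pattern": "daos_*.yml",
        "hosts": all_hosts,
        "depth": 1,
        "timeout": 300,
    }

for label, info in remote_files.items():
    print(label, "->", info["source"])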