From 007a3111ff334bc6d759258faef6b7859622df01 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 4 Apr 2024 18:09:06 -0500 Subject: [PATCH 001/101] Redirect and bring up to master dragon --- smartsim/_core/control/controller.py | 2 - smartsim/_core/entrypoints/dragon.py | 21 +- smartsim/_core/entrypoints/redis.py | 50 ++-- .../_core/launcher/dragon/dragonBackend.py | 74 +++++- .../_core/launcher/dragon/dragonLauncher.py | 131 ++++----- smartsim/_core/utils/telemetry/telemetry.py | 3 - smartsim/database/orchestrator.py | 5 +- smartsim/experiment.py | 8 + tests/test_dragon_launcher.py | 248 +----------------- 9 files changed, 172 insertions(+), 370 deletions(-) diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index b77698663..aabbad71f 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -118,8 +118,6 @@ def start( The controller will start the job-manager thread upon execution of all jobs. """ - if isinstance(self._launcher, DragonLauncher): - self._launcher.connect_to_dragon(exp_path) self._jobs.kill_on_interrupt = kill_on_interrupt # register custom signal handler for ^C (SIGINT) diff --git a/smartsim/_core/entrypoints/dragon.py b/smartsim/_core/entrypoints/dragon.py index 2d1fd1c52..4706e8c5e 100644 --- a/smartsim/_core/entrypoints/dragon.py +++ b/smartsim/_core/entrypoints/dragon.py @@ -35,7 +35,6 @@ from types import FrameType import zmq -import zmq.auth.thread from smartsim._core.launcher.dragon import dragonSockets from smartsim._core.launcher.dragon.dragonBackend import DragonBackend @@ -86,10 +85,7 @@ def print_summary(network_interface: str, ip_address: str) -> None: def run( - dragon_head_address: str, - dragon_pid: int, - zmq_context: zmq.Context[t.Any], - zmq_authenticator: zmq.auth.thread.ThreadAuthenticator, + dragon_head_address: str, dragon_pid: int, zmq_context: zmq.Context[t.Any] ) -> None: logger.debug(f"Opening socket {dragon_head_address}") @@ -97,10 +93,7 @@ 
def run( zmq_context.setsockopt(zmq.RCVTIMEO, value=1000) zmq_context.setsockopt(zmq.REQ_CORRELATE, 1) zmq_context.setsockopt(zmq.REQ_RELAXED, 1) - - dragon_head_socket, zmq_authenticator = dragonSockets.get_secure_socket( - context, zmq.REP, True, zmq_authenticator - ) + dragon_head_socket = zmq_context.socket(zmq.REP) dragon_head_socket.bind(dragon_head_address) dragon_backend = DragonBackend(pid=dragon_pid) @@ -124,8 +117,8 @@ def run( except zmq.Again: logger.error("Could not send response back to launcher.") - dragon_backend.print_status() dragon_backend.update() + dragon_backend.print_status() if not (dragon_backend.should_shutdown or SHUTDOWN_INITIATED): logger.debug(f"Listening to {dragon_head_address}") else: @@ -148,9 +141,7 @@ def main(args: argparse.Namespace, zmq_context: zmq.Context[t.Any]) -> int: else: dragon_head_address += ":5555" - launcher_socket, authenticator = dragonSockets.get_secure_socket( - context, zmq.REQ, False - ) + launcher_socket = zmq_context.socket(zmq.REQ) launcher_socket.connect(args.launching_address) client = dragonSockets.as_client(launcher_socket) @@ -167,14 +158,10 @@ def main(args: argparse.Namespace, zmq_context: zmq.Context[t.Any]) -> int: dragon_head_address=dragon_head_address, dragon_pid=response.dragon_pid, zmq_context=zmq_context, - zmq_authenticator=authenticator, ) except Exception as e: logger.error(f"Dragon server failed with {e}", exc_info=True) return os.EX_SOFTWARE - finally: - if authenticator.is_alive(): - authenticator.stop() logger.info("Shutting down! 
Bye bye!") return 0 diff --git a/smartsim/_core/entrypoints/redis.py b/smartsim/_core/entrypoints/redis.py index be2c54749..310a4fb66 100644 --- a/smartsim/_core/entrypoints/redis.py +++ b/smartsim/_core/entrypoints/redis.py @@ -113,28 +113,18 @@ def main(args: argparse.Namespace) -> int: *build_bind_args(src_addr, *bind_addrs), ] - # Prevent redirection of stdout and stderr - with ( - open(shard_data.name + ".out", "w", encoding="utf-8") - if args.redirect_output - else sys.stdout - ) as sys.stdout, ( - open(shard_data.name + ".err", "w", encoding="utf-8") - if args.redirect_output - else sys.stderr - ) as sys.stderr: - print_summary(cmd, args.ifname, shard_data) - - try: - process = psutil.Popen(cmd, stdout=PIPE, stderr=STDOUT) - DBPID = process.pid - - for line in iter(process.stdout.readline, b""): - print(line.decode("utf-8").rstrip(), flush=True) - except Exception as e: - cleanup() - raise SSInternalError("Database process starter raised an exception") from e - return 0 + print_summary(cmd, args.ifname, shard_data) + + try: + process = psutil.Popen(cmd, stdout=PIPE, stderr=STDOUT) + DBPID = process.pid + + for line in iter(process.stdout.readline, b""): + print(line.decode("utf-8").rstrip(), flush=True) + except Exception as e: + cleanup() + raise SSInternalError("Database process starter raised an exception") from e + return 0 def cleanup() -> None: @@ -191,14 +181,14 @@ def cleanup() -> None: action="store_true", help="Specify if this orchestrator shard is part of a cluster", ) - parser.add_argument( - "+redirect_output", - action="store_true", - help=( - "Specify if stdout and stderr of this script should be redirected. " - + "Only needed for dragon launcher." - ), - ) + # parser.add_argument( + # "+redirect_output", + # action="store_true", + # help=( + # "Specify if stdout and stderr of this script should be redirected. " + # + "Only needed for dragon launcher." 
+ # ), + # ) args_ = parser.parse_args() # make sure to register the cleanup before the start diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index f04f9e39e..943452e4a 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -32,13 +32,12 @@ # pylint: disable=import-error # isort: off +from dragon.infrastructure.connection import Connection from dragon.infrastructure.policy import Policy -from dragon.native.process import Process, TemplateProcess +from dragon.native.process import Process, ProcessTemplate, Popen from dragon.native.process_group import ( ProcessGroup, DragonProcessGroupError, - Error, - Running, ) from dragon.native.machine import System, Node @@ -61,8 +60,8 @@ from smartsim._core.utils.helpers import create_short_id_str from smartsim.status import TERMINAL_STATUSES, SmartSimStatus -DRG_ERROR_STATUS = str(Error()) -DRG_RUNNING_STATUS = str(Running()) +DRG_ERROR_STATUS = "Error" +DRG_RUNNING_STATUS = "Running" @dataclass @@ -78,6 +77,27 @@ def smartsim_info(self) -> t.Tuple[SmartSimStatus, t.Optional[t.List[int]]]: return (self.status, self.return_codes) +# Thanks to Colin Wahl from HPE HPC Dragon Team +def redir_worker(io_conn: Connection, file_path: str) -> None: + """Read stdout/stderr from the Dragon connection. + + :param io_conn: Dragon connection to stdout or stderr + :type io_conn: Connection + :param file_path: path to file to write to + :type file_path: str + """ + file_to_write = open(file_path, "a") + try: + while True: + output = io_conn.recv() + print(output, flush=True, file=file_to_write, end="") + except EOFError: + pass + finally: + io_conn.close() + file_to_write.close() + + class DragonBackend: """The DragonBackend class is the main interface between SmartSim and Dragon. 
It is not intended to be user-facing, @@ -159,8 +179,9 @@ def _allocate_step( def _get_new_id(self) -> str: with self._step_id_lock: + step_id = create_short_id_str() + "-" + str(self._step_id) self._step_id += 1 - return create_short_id_str() + "-" + str(self._step_id) + return step_id @functools.singledispatchmethod # Deliberately suppressing errors so that overloads have the same signature @@ -202,26 +223,29 @@ def update(self) -> None: restart=False, pmi_enabled=request.pmi_enabled, policy=global_policy ) + policies = [] for node_name in hosts[: request.nodes]: local_policy = Policy( placement=Policy.Placement.HOST_NAME, host_name=node_name ) - tmp_proc = TemplateProcess( + policies.extend([local_policy] * request.tasks_per_node) + tmp_proc = ProcessTemplate( target=request.exe, args=request.exe_args, cwd=request.path, env={**request.current_env, **request.env}, - # stdout=Popen.PIPE, - # stderr=Popen.PIPE, + stdout=Popen.PIPE, + stderr=Popen.PIPE, policy=local_policy, ) grp.add_process(nproc=request.tasks_per_node, template=tmp_proc) grp.init() grp.start() + puids = grp.puids self._group_infos[step_id] = ProcessGroupInfo( process_group=grp, - puids=grp.puids, + puids=puids, return_codes=[], status=SmartSimStatus.STATUS_RUNNING, hosts=hosts, @@ -229,6 +253,33 @@ def update(self) -> None: self._running_steps.append(step_id) started.append(step_id) + try: + grp_redir = ProcessGroup(restart=False, policy=global_policy) + for pol, puid in zip(policies, puids): + proc = Process(None, ident=puid) + grp_redir.add_process( + nproc=1, + template=ProcessTemplate( + target=redir_worker, + args=(proc.stdout_conn, request.output_file), + stdout=Popen.DEVNULL, + policy=pol, + ), + ) + grp_redir.add_process( + nproc=1, + template=ProcessTemplate( + target=redir_worker, + args=(proc.stderr_conn, request.error_file), + stdout=Popen.DEVNULL, + policy=pol, + ), + ) + grp_redir.init() + grp_redir.start() + except Exception as e: + raise IOError("Could not redirect output") from e 
+ if started: print(f"{self._updates}: {started=}") @@ -249,7 +300,8 @@ def update(self) -> None: group_info.return_codes = [ Process(None, ident=puid).returncode for puid in puids ] - except (ValueError, TypeError): + except (ValueError, TypeError) as e: + print(e) group_info.return_codes = [-1 for _ in puids] else: group_info.return_codes = [0] diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py index d6f2d5901..1a2d5ad46 100644 --- a/smartsim/_core/launcher/dragon/dragonLauncher.py +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -39,9 +39,7 @@ from pathlib import Path from threading import RLock -import psutil import zmq -import zmq.auth.thread from smartsim._core.launcher.dragon import dragonSockets from smartsim.error.errors import SmartSimError @@ -77,6 +75,8 @@ DRG_LOCK = RLock() DRG_CTX = zmq.Context() +DRG_CTX.setsockopt(zmq.REQ_CORRELATE, 1) +DRG_CTX.setsockopt(zmq.REQ_RELAXED, 1) class DragonLauncher(WLMLauncher): @@ -103,16 +103,25 @@ def __init__(self) -> None: # Returned by dragon head, useful if shutdown is to be requested # but process was started by another launcher self._dragon_head_pid: t.Optional[int] = None - self._authenticator: t.Optional[zmq.auth.thread.ThreadAuthenticator] = None + self._dragon_server_path = os.getenv( + "SMARTSIM_DRAGON_SERVER_PATH_EXP", + os.getenv("SMARTSIM_DRAGON_SERVER_PATH", None), + ) + if self._dragon_server_path is None: + raise SmartSimError( + "Dragon server path was not set. " + "This should not happen if the launcher was started by an experiment.\n" + "If the DragonLauncher was started manually, " + "then the environment variable SMARTSIM_DRAGON_SERVER_PATH " + "should be set to an existing directory." 
+ ) @property def is_connected(self) -> bool: return self._dragon_head_socket is not None def _handshake(self, address: str) -> None: - self._dragon_head_socket, self._authenticator = dragonSockets.get_secure_socket( - self._context, zmq.REQ, False, self._authenticator - ) + self._dragon_head_socket = self._context.socket(zmq.REQ) self._dragon_head_socket.connect(address) try: dragon_handshake = _assert_schema_type( @@ -134,19 +143,20 @@ def _set_timeout(self, timeout: int) -> None: self._context.setsockopt(zmq.SNDTIMEO, value=timeout) self._context.setsockopt(zmq.RCVTIMEO, value=timeout) - def connect_to_dragon(self, path: t.Union[str, "os.PathLike[str]"]) -> None: - self._connect_to_dragon(path) + def ensure_connected(self) -> None: + if not self.is_connected: + self._connect_to_dragon() if not self.is_connected: raise LauncherError("Could not connect to Dragon server") # pylint: disable-next=too-many-statements,too-many-locals - def _connect_to_dragon(self, path: t.Union[str, "os.PathLike[str]"]) -> None: + def _connect_to_dragon(self) -> None: with DRG_LOCK: # TODO use manager instead if self.is_connected: return - path = _resolve_dragon_path(path) + path = _resolve_dragon_path(self._dragon_server_path) dragon_config_log = path / CONFIG.dragon_log_filename if dragon_config_log.is_file(): @@ -185,10 +195,7 @@ def _connect_to_dragon(self, path: t.Union[str, "os.PathLike[str]"]) -> None: launcher_socket: t.Optional[zmq.Socket[t.Any]] = None if address is not None: self._set_timeout(self._startup_timeout) - - launcher_socket, self._authenticator = dragonSockets.get_secure_socket( - self._context, zmq.REP, True, self._authenticator - ) + launcher_socket = self._context.socket(zmq.REP) # find first available port >= 5995 port = find_free_port(start=5995) @@ -263,7 +270,6 @@ def log_dragon_outputs() -> None: _dragon_cleanup, server_socket=server_socket, server_process_pid=server_process_pid, - server_authenticator=self._authenticator, ) else: # TODO parse output file 
@@ -271,11 +277,11 @@ def log_dragon_outputs() -> None: raise LauncherError("Could not receive address of Dragon head process") def cleanup(self) -> None: - _dragon_cleanup( - server_socket=self._dragon_head_socket, - server_process_pid=self._dragon_head_pid, - server_authenticator=self._authenticator, - ) + if self._dragon_head_socket is not None and self._dragon_head_pid is not None: + _dragon_cleanup( + server_socket=self._dragon_head_socket, + server_process_pid=self._dragon_head_pid, + ) # RunSettings types supported by this launcher @property @@ -293,9 +299,6 @@ def run(self, step: Step) -> t.Optional[str]: :rtype: str """ - if not self.is_connected: - raise LauncherError("Dragon environment not connected") - if not self.task_manager.actively_monitoring: self.task_manager.start() @@ -306,6 +309,7 @@ def run(self, step: Step) -> t.Optional[str]: out, err = step.get_output_files() if isinstance(step, DragonStep): + self.ensure_connected() run_args = step.run_settings.run_args env = step.run_settings.env_vars nodes = int(run_args.get("nodes", None) or 1) @@ -328,7 +332,7 @@ def run(self, step: Step) -> t.Optional[str]: DragonRunResponse, ) step_id = task_id = str(response.step_id) - elif isinstance(step, LocalStep): + else: # pylint: disable-next=consider-using-with out_strm = open(out, "w+", encoding="utf-8") # pylint: disable-next=consider-using-with @@ -336,11 +340,6 @@ def run(self, step: Step) -> t.Optional[str]: task_id = self.task_manager.start_task( cmd, step.cwd, step.env, out=out_strm.fileno(), err=err_strm.fileno() ) - else: # pragma: no-cover - raise TypeError( - f"{type(self).__name__} is unable to launch a step of " - f"type {type(step)}" - ) self.step_mapping.add(step.name, step_id, task_id, step.managed) @@ -355,8 +354,7 @@ def stop(self, step_name: str) -> StepInfo: :rtype: StepInfo """ - if not self.is_connected: - raise LauncherError("Launcher is not connected to Dragon.") + self.ensure_connected() stepmap = self.step_mapping[step_name] 
step_id = str(stepmap.step_id) @@ -382,8 +380,7 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: :rtype: list[StepInfo] """ - if not self.is_connected: - raise LauncherError("Launcher is not connected to Dragon.") + self.ensure_connected() response = _assert_schema_type( self._send_request(DragonUpdateStatusRequest(step_ids=step_ids)), @@ -420,7 +417,6 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: def _send_request(self, request: DragonRequest, flags: int = 0) -> DragonResponse: if (socket := self._dragon_head_socket) is None: raise LauncherError("Launcher is not connected to Dragon") - return self.send_req_with_socket(socket, request, flags) def __str__(self) -> str: @@ -466,53 +462,58 @@ def send_req_with_socket( client = dragonSockets.as_client(socket) with DRG_LOCK: logger.debug(f"Sending {type(request).__name__}: {request}") - client.send(request, flags) - return client.recv() + send_trials = 5 + while send_trials: + try: + client.send(request, flags) + break + except zmq.Again as e: + send_trials -= 1 + logger.debug( + f"Could not send request in {client.socket.getsockopt(zmq.SNDTIMEO)/1000} seconds" + ) + if send_trials < 1: + raise e + + time.sleep(1) + receive_trials = 5 + while receive_trials: + try: + response = client.recv() + break + except zmq.Again as e: + receive_trials -= 1 + logger.debug( + f"Did not receive response in {client.socket.getsockopt(zmq.RCVTIMEO)/1000} seconds" + ) + if receive_trials < 1: + raise e + + logger.debug(f"Received {type(response).__name__}: {response}") + return response def _assert_schema_type(obj: object, typ: t.Type[_SchemaT], /) -> _SchemaT: if not isinstance(obj, typ): - raise TypeError("Expected schema of type `{typ}`, but got {type(obj)}") + raise TypeError(f"Expected schema of type `{typ}`, but got {type(obj)}") return obj -def _dragon_cleanup( - server_socket: t.Optional[zmq.Socket[t.Any]] = None, - server_process_pid: t.Optional[int] = 0, - 
server_authenticator: t.Optional[zmq.auth.thread.ThreadAuthenticator] = None, -) -> None: - """Clean up resources used by the launcher. - :param server_socket: (optional) Socket used to connect to dragon environment - :type server_socket: Optional[zmq.Socket] - :param server_process_pid: (optional) Process ID of the dragon entrypoint - :type server_process_pid: Optional[int] - :param server_authenticator: (optional) Authenticator used to secure sockets - :type server_authenticator: Optional[zmq.auth.thread.ThreadAuthenticator] - """ - +def _dragon_cleanup(server_socket: zmq.Socket[t.Any], server_process_pid: int) -> None: try: - if server_socket is not None: - DragonLauncher.send_req_with_socket(server_socket, DragonShutdownRequest()) + DragonLauncher.send_req_with_socket(server_socket, DragonShutdownRequest()) except zmq.error.ZMQError as e: # Can't use the logger as I/O file may be closed print("Could not send shutdown request to dragon server") print(f"ZMQ error: {e}", flush=True) finally: time.sleep(1) - - try: - if server_process_pid and psutil.pid_exists(server_process_pid): + try: os.kill(server_process_pid, signal.SIGINT) print("Sent SIGINT to dragon server") - except ProcessLookupError: - # Can't use the logger as I/O file may be closed - print("Dragon server is not running.", flush=True) - - try: - if server_authenticator is not None: - server_authenticator.stop() - except Exception: - print("Authenticator shutdown error") + except ProcessLookupError: + # Can't use the logger as I/O file may be closed + print("Dragon server is not running.", flush=True) def _resolve_dragon_path(fallback: t.Union[str, "os.PathLike[str]"]) -> Path: diff --git a/smartsim/_core/utils/telemetry/telemetry.py b/smartsim/_core/utils/telemetry/telemetry.py index bca361252..584eb554e 100644 --- a/smartsim/_core/utils/telemetry/telemetry.py +++ b/smartsim/_core/utils/telemetry/telemetry.py @@ -153,9 +153,6 @@ def set_launcher( if self._launcher is None: raise SmartSimError("Launcher 
init failed") - if isinstance(self._launcher, DragonLauncher): - self._launcher.connect_to_dragon(exp_dir) - self.job_manager.set_launcher(self._launcher) self.job_manager.start() diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index 39dae6aab..9d3528fa0 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -866,10 +866,7 @@ def _get_start_script_args( ] if cluster: cmd.append("+cluster") # is the shard part of a cluster - # TODO eliminate this as soon as output redirection works - # for dragon - if self.launcher == "dragon": - cmd.append("+redirect_output") + return cmd def _get_db_hosts(self) -> t.List[str]: diff --git a/smartsim/experiment.py b/smartsim/experiment.py index 44d82b5e4..fbfe7d712 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -168,11 +168,19 @@ def __init__( if self._launcher == "cobalt": raise SSUnsupportedError("Cobalt launcher is no longer supported.") + if launcher == "dragon": + self._set_dragon_server_path() + self._control = Controller(launcher=self._launcher) self.db_identifiers: t.Set[str] = set() self._telemetry_cfg = ExperimentTelemetryConfiguration() + def _set_dragon_server_path(self): + """Set path for dragon server through environment varialbes""" + if not "SMARTSIM_DRAGON_SERVER_PATH" in environ: + environ["SMARTSIM_DRAGON_SERVER_PATH_EXP"] = self.exp_path + @_contextualize def start( self, diff --git a/tests/test_dragon_launcher.py b/tests/test_dragon_launcher.py index cd757e302..6c29b9fe9 100644 --- a/tests/test_dragon_launcher.py +++ b/tests/test_dragon_launcher.py @@ -24,20 +24,14 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import multiprocessing as mp -import os import typing as t import pytest -import zmq -from smartsim._core.config.config import get_config from smartsim._core.launcher.dragon.dragonLauncher import DragonLauncher -from smartsim._core.launcher.dragon.dragonSockets import get_secure_socket from smartsim._core.schemas.dragonRequests import DragonBootstrapRequest -from smartsim._core.schemas.dragonResponses import DragonHandshakeResponse -from smartsim._core.utils.network import IFConfig, find_free_port -from smartsim._core.utils.security import KeyManager +from smartsim._core.utils.network import IFConfig +from smartsim.error.errors import LauncherError # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a @@ -48,7 +42,7 @@ def __init__(self, *args: t.Any, **kwargs: t.Any) -> None: ... @property def pid(self) -> int: - return 99999 + return 1 @property def returncode(self) -> int: @@ -84,252 +78,30 @@ def bind_address(self) -> str: return self._bind_address -class MockAuthenticator: - def __init__(self, context: zmq.Context) -> None: - self.num_starts: int = 0 - self.num_stops: int = 0 - self.num_configure_curves: int = 0 - self.context = context - - def configure_curve(self, *args, **kwargs) -> None: - self.cfg_args = args - self.cfg_kwargs = kwargs - self.num_configure_curves += 1 - - def start(self) -> None: - self.num_starts += 1 - - def stop(self) -> None: - self.num_stops += 1 - - def is_alive(self) -> bool: - return self.num_starts > 0 and self.num_stops == 0 - - -def mock_dragon_env(test_dir, *args, **kwargs): - """Create a mock dragon environment that can talk to the launcher through ZMQ""" - try: - context = zmq.Context() - addr = "127.0.0.1" - callback_port = kwargs["port"] - head_port = find_free_port(start=callback_port + 1) - - callback_socket, dragon_authenticator = get_secure_socket( - context, zmq.REQ, False - ) - - dragon_head_socket, dragon_authenticator = get_secure_socket( - context, zmq.REP, True, 
dragon_authenticator - ) - - full_addr = f"{addr}:{callback_port}" - callback_socket.connect(f"tcp://{full_addr}") - - full_head_addr = f"tcp://{addr}:{head_port}" - dragon_head_socket.bind(full_head_addr) - - req = DragonBootstrapRequest(address=full_head_addr) - - msg_sent = False - while not msg_sent: - callback_socket.send_string("bootstrap|" + req.json()) - # hold until bootstrap response is received - _ = callback_socket.recv() - msg_sent = True - - hand_shaken = False - while not hand_shaken: - # other side should set up a socket and push me a `HandshakeRequest` - _ = dragon_head_socket.recv() - # acknowledge handshake success w/DragonHandshakeResponse - handshake_ack = DragonHandshakeResponse(dragon_pid=os.getpid()) - dragon_head_socket.send_string(f"handshake|{handshake_ack.json()}") - - hand_shaken = True - except Exception as ex: - print(f"exception occurred while configuring mock handshaker: {ex}") - finally: - dragon_authenticator.stop() - callback_socket.close() - dragon_head_socket.close() - - def test_dragon_connect_bind_address(monkeypatch: pytest.MonkeyPatch, test_dir: str): """Test the connection to a dragon environment dynamically selects an open port in the range supplied""" with monkeypatch.context() as ctx: - # make sure we don't touch "real keys" during a test - ctx.setenv("SMARTSIM_KEY_PATH", test_dir) - - mock_socket = MockSocket() - - # look at test_dir for dragon config ctx.setenv("SMARTSIM_DRAGON_SERVER_PATH", test_dir) - # avoid finding real interface ctx.setattr( "smartsim._core.launcher.dragon.dragonLauncher.get_best_interface_and_address", lambda: IFConfig(interface="faux_interface", address="127.0.0.1"), ) - # we need to set the socket value or is_connected returns False ctx.setattr( "smartsim._core.launcher.dragon.dragonLauncher.DragonLauncher._handshake", - lambda self, address: setattr(self, "_dragon_head_socket", mock_socket), + lambda self, address: ..., ) - # avoid starting a real authenticator thread - 
ctx.setattr("zmq.auth.thread.ThreadAuthenticator", MockAuthenticator) - # avoid starting a real zmq socket + + mock_socket = MockSocket() + ctx.setattr("zmq.Context.socket", mock_socket) - # avoid starting a real process for dragon entrypoint ctx.setattr("subprocess.Popen", lambda *args, **kwargs: MockPopen()) dragon_launcher = DragonLauncher() - dragon_launcher.connect_to_dragon(test_dir) + with pytest.raises(LauncherError) as ex: + # it will complain about failure to connect when validating... + dragon_launcher.connect_to_dragon(test_dir) chosen_port = int(mock_socket.bind_address.split(":")[-1]) assert chosen_port >= 5995 - - -@pytest.mark.parametrize( - "socket_type, is_server", - [ - pytest.param(zmq.REQ, True, id="as-server"), - pytest.param(zmq.REP, False, id="as-client"), - ], -) -def test_secure_socket_authenticator_setup( - test_dir: str, monkeypatch: pytest.MonkeyPatch, socket_type: int, is_server: bool -): - """Ensure the authenticator created by the secure socket factory method - is fully configured and started when returned to a client""" - context = zmq.Context() - - with monkeypatch.context() as ctx: - # look at test dir for dragon config - ctx.setenv("SMARTSIM_KEY_PATH", test_dir) - # avoid starting a real authenticator thread - ctx.setattr("zmq.auth.thread.ThreadAuthenticator", MockAuthenticator) - - _, authenticator = get_secure_socket(context, socket_type, is_server=is_server) - - km = KeyManager(get_config(), as_server=is_server) - - assert isinstance(authenticator, MockAuthenticator) - - # ensure authenticator was configured - assert authenticator.num_configure_curves > 0 - # ensure authenticator was started - assert authenticator.num_starts > 0 - assert authenticator.context == context - # ensure authenticator will accept any secured connection - assert authenticator.cfg_kwargs.get("domain", "") == "*" - # ensure authenticator is using the expected set of keys - assert authenticator.cfg_kwargs.get("location", "") == km.client_keys_dir - - 
-@pytest.mark.parametrize( - "as_server", - [ - pytest.param(True, id="server-socket"), - pytest.param(False, id="client-socket"), - ], -) -def test_secure_socket_setup( - test_dir: str, monkeypatch: pytest.MonkeyPatch, as_server: bool -): - """Ensure the authenticator created by the secure socket factory method - is fully configured and started when returned to a client""" - context = zmq.Context() - - with monkeypatch.context() as ctx: - # look at test dir for dragon config - ctx.setenv("SMARTSIM_KEY_PATH", test_dir) - # avoid starting a real authenticator thread - ctx.setattr("zmq.auth.thread.ThreadAuthenticator", MockAuthenticator) - - socket, _ = get_secure_socket(context, zmq.REP, as_server) - - # verify the socket is correctly configured to use curve authentication - assert bool(socket.CURVE_SERVER) == as_server - assert not socket.closed - - socket.close() - - -def test_secure_socket(test_dir: str, monkeypatch: pytest.MonkeyPatch): - """Ensure the authenticator created by the secure socket factory method - is fully configured and started when returned to a client""" - - with monkeypatch.context() as ctx: - # make sure we don't touch "real keys" during a test - ctx.setenv("SMARTSIM_KEY_PATH", test_dir) - - context = zmq.Context() - server, authenticator = get_secure_socket(context, zmq.REP, True) - - ip, port = "127.0.0.1", find_free_port(start=9999) - - try: - server.bind(f"tcp://*:{port}") - - client, authenticator = get_secure_socket( - context, zmq.REQ, False, authenticator - ) - - client.connect(f"tcp://{ip}:{port}") - - to_send = "you get a foo! you get a foo! everybody gets a foo!" 
- client.send_string(to_send, flags=zmq.NOBLOCK) - - received_msg = server.recv_string() - assert received_msg == to_send - print("server receieved: ", received_msg) - finally: - if authenticator: - authenticator.stop() - if client: - client.close() - if server: - server.close() - - -# def test_dragon_launcher_handshake(monkeypatch: pytest.MonkeyPatch, test_dir: str): -# """Test that a real handshake between a launcher & dragon environment -# completes successfully using secure sockets""" -# context = zmq.Context() -# addr = "127.0.0.1" -# bootstrap_port = find_free_port(start=5995) - -# with monkeypatch.context() as ctx: -# # make sure we don't touch "real keys" during a test -# ctx.setenv("SMARTSIM_KEY_PATH", test_dir) - -# # look at test dir for dragon config -# ctx.setenv("SMARTSIM_DRAGON_SERVER_PATH", test_dir) -# # avoid finding real interface since we may not be on a super -# ctx.setattr( -# "smartsim._core.launcher.dragon.dragonLauncher.get_best_interface_and_address", -# lambda: IFConfig("faux_interface", addr), -# ) - -# # start up a faux dragon env that knows how to do the handshake process -# # but uses secure sockets for all communication. 
-# mock_dragon = mp.Process( -# target=mock_dragon_env, -# daemon=True, -# kwargs={"port": bootstrap_port, "test_dir": test_dir}, -# ) - -# def fn(*args, **kwargs): -# mock_dragon.start() -# return mock_dragon - -# ctx.setattr("subprocess.Popen", fn) - -# launcher = DragonLauncher() - -# try: -# # connect executes the complete handshake and raises an exception if comms fails -# launcher.connect_to_dragon(test_dir) -# finally: -# launcher.cleanup() From 8911f342b4fcfeb36f94c5024e905ce6434b1abc Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 5 Apr 2024 12:37:04 -0500 Subject: [PATCH 002/101] Seaparate Connector from Launcher --- smartsim/_core/entrypoints/redis.py | 1 - .../_core/launcher/dragon/dragonBackend.py | 122 +++--- .../_core/launcher/dragon/dragonConnector.py | 388 ++++++++++++++++++ .../_core/launcher/dragon/dragonLauncher.py | 336 +-------------- smartsim/_core/utils/telemetry/telemetry.py | 6 +- smartsim/experiment.py | 2 +- 6 files changed, 475 insertions(+), 380 deletions(-) create mode 100644 smartsim/_core/launcher/dragon/dragonConnector.py diff --git a/smartsim/_core/entrypoints/redis.py b/smartsim/_core/entrypoints/redis.py index 310a4fb66..b856fdb43 100644 --- a/smartsim/_core/entrypoints/redis.py +++ b/smartsim/_core/entrypoints/redis.py @@ -28,7 +28,6 @@ import json import os import signal -import sys import textwrap import typing as t from subprocess import PIPE, STDOUT diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 943452e4a..63e0ba018 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -86,16 +86,15 @@ def redir_worker(io_conn: Connection, file_path: str) -> None: :param file_path: path to file to write to :type file_path: str """ - file_to_write = open(file_path, "a") try: - while True: - output = io_conn.recv() - print(output, flush=True, file=file_to_write, end="") + with open(file_path, "a", 
encoding="utf-8") as file_to_write: + while True: + output = io_conn.recv() + print(output, flush=True, file=file_to_write, end="") except EOFError: pass finally: io_conn.close() - file_to_write.close() class DragonBackend: @@ -206,8 +205,41 @@ def _(self, request: DragonRunRequest) -> DragonRunResponse: ) return DragonRunResponse(step_id=step_id) - def update(self) -> None: - self._updates += 1 + @staticmethod + def _start_redirect_workers( + global_policy: Policy, + policies: t.List[Policy], + puids: t.List[int], + out_file: t.Optional[str], + err_file: t.Optional[str], + ) -> None: + grp_redir = ProcessGroup(restart=False, policy=global_policy) + for pol, puid in zip(policies, puids): + proc = Process(None, ident=puid) + if out_file: + grp_redir.add_process( + nproc=1, + template=ProcessTemplate( + target=redir_worker, + args=(proc.stdout_conn, out_file), + stdout=Popen.DEVNULL, + policy=pol, + ), + ) + if err_file: + grp_redir.add_process( + nproc=1, + template=ProcessTemplate( + target=redir_worker, + args=(proc.stderr_conn, err_file), + stdout=Popen.DEVNULL, + policy=pol, + ), + ) + grp_redir.init() + grp_redir.start() + + def _start_steps(self) -> None: started = [] for step_id, request in self._queued_steps.items(): hosts = self._allocate_step(step_id, self._queued_steps[step_id]) @@ -254,29 +286,13 @@ def update(self) -> None: started.append(step_id) try: - grp_redir = ProcessGroup(restart=False, policy=global_policy) - for pol, puid in zip(policies, puids): - proc = Process(None, ident=puid) - grp_redir.add_process( - nproc=1, - template=ProcessTemplate( - target=redir_worker, - args=(proc.stdout_conn, request.output_file), - stdout=Popen.DEVNULL, - policy=pol, - ), - ) - grp_redir.add_process( - nproc=1, - template=ProcessTemplate( - target=redir_worker, - args=(proc.stderr_conn, request.error_file), - stdout=Popen.DEVNULL, - policy=pol, - ), - ) - grp_redir.init() - grp_redir.start() + DragonBackend._start_redirect_workers( + global_policy, + policies, 
+ puids, + request.output_file, + request.error_file, + ) except Exception as e: raise IOError("Could not redirect output") from e @@ -286,30 +302,36 @@ def update(self) -> None: for step_id in started: self._queued_steps.pop(step_id) + def _refresh_statuses(self) -> None: terminated = [] for step_id in self._running_steps: group_info = self._group_infos[step_id] grp = group_info.process_group - if grp.status == DRG_RUNNING_STATUS: - group_info.status = SmartSimStatus.STATUS_RUNNING + if grp is None: + group_info.status = SmartSimStatus.STATUS_FAILED + group_info.return_codes = [-1] else: - puids = group_info.puids - if puids is not None and all(puid is not None for puid in puids): - try: - group_info.return_codes = [ - Process(None, ident=puid).returncode for puid in puids - ] - except (ValueError, TypeError) as e: - print(e) - group_info.return_codes = [-1 for _ in puids] + if grp.status == DRG_RUNNING_STATUS: + group_info.status = SmartSimStatus.STATUS_RUNNING else: - group_info.return_codes = [0] - group_info.status = ( - SmartSimStatus.STATUS_FAILED - if any(group_info.return_codes) or grp.status == DRG_ERROR_STATUS - else SmartSimStatus.STATUS_COMPLETED - ) + puids = group_info.puids + if puids is not None and all(puid is not None for puid in puids): + try: + group_info.return_codes = [ + Process(None, ident=puid).returncode for puid in puids + ] + except (ValueError, TypeError) as e: + print(e) + group_info.return_codes = [-1 for _ in puids] + else: + group_info.return_codes = [0] + group_info.status = ( + SmartSimStatus.STATUS_FAILED + if any(group_info.return_codes) + or grp.status == DRG_ERROR_STATUS + else SmartSimStatus.STATUS_COMPLETED + ) if group_info.status in TERMINAL_STATUSES: terminated.append(step_id) @@ -327,6 +349,12 @@ def update(self) -> None: self._allocated_hosts.pop(host) self._free_hosts.append(host) + def update(self) -> None: + self._updates += 1 + + self._start_steps() + self._refresh_statuses() + @process_request.register def _(self, 
request: DragonUpdateStatusRequest) -> DragonUpdateStatusResponse: return DragonUpdateStatusResponse( diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py new file mode 100644 index 000000000..867cff055 --- /dev/null +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -0,0 +1,388 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from __future__ import annotations + +import atexit +import fileinput +import itertools +import json +import os +import signal +import subprocess +import sys +import time +import typing as t +from pathlib import Path +from threading import RLock + +import zmq + +from smartsim._core.launcher.dragon import dragonSockets +from smartsim.error.errors import SmartSimError + +from ....log import get_logger +from ...config import CONFIG +from ...schemas import ( + DragonBootstrapRequest, + DragonBootstrapResponse, + DragonHandshakeRequest, + DragonHandshakeResponse, + DragonRequest, + DragonResponse, + DragonShutdownRequest, +) +from ...utils.network import find_free_port, get_best_interface_and_address + +logger = get_logger(__name__) + +_SchemaT = t.TypeVar("_SchemaT", bound=t.Union[DragonRequest, DragonResponse]) + +DRG_LOCK = RLock() +DRG_CTX = zmq.Context() +DRG_CTX.setsockopt(zmq.REQ_CORRELATE, 1) +DRG_CTX.setsockopt(zmq.REQ_RELAXED, 1) + + +class DragonConnector: + """This class encapsulates the functionality needed + to launch start a Dragon server and communicate with it. 
+ + """ + + def __init__(self) -> None: + super().__init__() + self._context = DRG_CTX + self._timeout = CONFIG.dragon_server_timeout + self._reconnect_timeout = CONFIG.dragon_server_reconnect_timeout + self._startup_timeout = CONFIG.dragon_server_startup_timeout + self._context.setsockopt(zmq.SNDTIMEO, value=self._timeout) + self._context.setsockopt(zmq.RCVTIMEO, value=self._timeout) + self._dragon_head_socket: t.Optional[zmq.Socket[t.Any]] = None + self._dragon_head_process: t.Optional[subprocess.Popen[bytes]] = None + # Returned by dragon head, useful if shutdown is to be requested + # but process was started by another connector + self._dragon_head_pid: t.Optional[int] = None + self._dragon_server_path = os.getenv( + "SMARTSIM_DRAGON_SERVER_PATH_EXP", + os.getenv("SMARTSIM_DRAGON_SERVER_PATH", None), + ) + if self._dragon_server_path is None: + raise SmartSimError( + "DragonConnector could not find the dragon server path. " + "This should not happen if the Connector was started by an " + "experiment.\nIf the DragonConnector was started manually, " + "then the environment variable SMARTSIM_DRAGON_SERVER_PATH " + "should be set to an existing directory." 
+ ) + + @property + def is_connected(self) -> bool: + return self._dragon_head_socket is not None + + def _handshake(self, address: str) -> None: + self._dragon_head_socket = self._context.socket(zmq.REQ) + self._dragon_head_socket.connect(address) + try: + dragon_handshake = _assert_schema_type( + self.send_request(DragonHandshakeRequest()), DragonHandshakeResponse + ) + self._dragon_head_pid = dragon_handshake.dragon_pid + logger.debug( + f"Successful handshake with Dragon server at address {address}" + ) + except (zmq.ZMQError, zmq.Again) as e: + logger.debug(e) + self._dragon_head_socket.close() + self._dragon_head_socket = None + raise SmartSimError( + f"Unsuccessful handshake with Dragon server at address {address}" + ) from e + + def _set_timeout(self, timeout: int) -> None: + self._context.setsockopt(zmq.SNDTIMEO, value=timeout) + self._context.setsockopt(zmq.RCVTIMEO, value=timeout) + + def ensure_connected(self) -> None: + if not self.is_connected: + self.connect_to_dragon() + if not self.is_connected: + raise SmartSimError("Could not connect to Dragon server") + + # pylint: disable-next=too-many-statements,too-many-locals + def connect_to_dragon(self) -> None: + with DRG_LOCK: + # TODO use manager instead + if self.is_connected: + return + + if self._dragon_server_path is None: + raise SmartSimError("Path to Dragon server not set.") + + path = _resolve_dragon_path(self._dragon_server_path) + dragon_config_log = path / CONFIG.dragon_log_filename + + if dragon_config_log.is_file(): + dragon_confs = self._parse_launched_dragon_server_info_from_files( + [dragon_config_log] + ) + logger.debug(dragon_confs) + for dragon_conf in dragon_confs: + if not "address" in dragon_conf: + continue + logger.debug( + "Found dragon server config file. Checking if the server" + f" is still up at address {dragon_conf['address']}." 
+ ) + try: + self._set_timeout(self._reconnect_timeout) + self._handshake(dragon_conf["address"]) + except SmartSimError as e: + logger.warning(e) + finally: + self._set_timeout(self._timeout) + if self.is_connected: + return + + path.mkdir(parents=True, exist_ok=True) + + cmd = [ + "dragon", + sys.executable, + "-m", + "smartsim._core.entrypoints.dragon", + ] + + address = get_best_interface_and_address().address + socket_addr = "" + connector_socket: t.Optional[zmq.Socket[t.Any]] = None + if address is not None: + self._set_timeout(self._startup_timeout) + connector_socket = self._context.socket(zmq.REP) + + # find first available port >= 5995 + port = find_free_port(start=5995) + socket_addr = f"tcp://{address}:{port}" + logger.debug(f"Binding connector to {socket_addr}") + + connector_socket.bind(socket_addr) + cmd += ["+launching_address", socket_addr] + + dragon_out_file = path / "dragon_head.out" + dragon_err_file = path / "dragon_head.err" + + with open(dragon_out_file, "w", encoding="utf-8") as dragon_out, open( + dragon_err_file, "w", encoding="utf-8" + ) as dragon_err: + current_env = os.environ.copy() + current_env.update({"PYTHONUNBUFFERED": "1"}) + # pylint: disable-next=consider-using-with + self._dragon_head_process = subprocess.Popen( + args=cmd, + bufsize=0, + stderr=dragon_err.fileno(), + stdout=dragon_out.fileno(), + cwd=path, + shell=False, + env=current_env, + start_new_session=True, + ) + + if connector_socket is None: + raise SmartSimError("Socket failed to initialize") + + def log_dragon_outputs() -> None: + if self._dragon_head_process: + self._dragon_head_process.wait(1.0) + if self._dragon_head_process.stdout: + for line in iter( + self._dragon_head_process.stdout.readline, b"" + ): + logger.info(line.decode("utf-8").rstrip()) + if self._dragon_head_process.stderr: + for line in iter( + self._dragon_head_process.stderr.readline, b"" + ): + logger.warning(line.decode("utf-8").rstrip()) + 
logger.warning(self._dragon_head_process.returncode) + + if address is not None: + server = dragonSockets.as_server(connector_socket) + logger.debug(f"Listening to {socket_addr}") + request = _assert_schema_type(server.recv(), DragonBootstrapRequest) + + logger.debug(f"Connecting to {request.address}") + server.send( + DragonBootstrapResponse(dragon_pid=self._dragon_head_process.pid) + ) + + connector_socket.close() + self._set_timeout(self._timeout) + self._handshake(request.address) + + # Only the Connector which started the server is + # responsible of it, that's why we register the + # cleanup in this code branch. + # The cleanup function should not have references + # to this object to avoid Garbage Collector lockup + server_socket = self._dragon_head_socket + server_process_pid = self._dragon_head_process.pid + + if server_socket is not None and self._dragon_head_process is not None: + atexit.register( + _dragon_cleanup, + server_socket=server_socket, + server_process_pid=server_process_pid, + ) + else: + # TODO parse output file + log_dragon_outputs() + raise SmartSimError("Could not receive address of Dragon head process") + + def cleanup(self) -> None: + if self._dragon_head_socket is not None and self._dragon_head_pid is not None: + _dragon_cleanup( + server_socket=self._dragon_head_socket, + server_process_pid=self._dragon_head_pid, + ) + + def send_request(self, request: DragonRequest, flags: int = 0) -> DragonResponse: + self.ensure_connected() + if (socket := self._dragon_head_socket) is None: + raise SmartSimError("Not connected to Dragon") + return self._send_req_with_socket(socket, request, flags) + + @staticmethod + def _parse_launched_dragon_server_info_from_iterable( + stream: t.Iterable[str], num_dragon_envs: t.Optional[int] = None + ) -> t.List[t.Dict[str, str]]: + lines = (line.strip() for line in stream) + lines = (line for line in lines if line) + tokenized = (line.split(maxsplit=1) for line in lines) + tokenized = (tokens for tokens in 
tokenized if len(tokens) > 1) + dragon_env_jsons = ( + config_dict + for first, config_dict in tokenized + if "DRAGON_SERVER_CONFIG" in first + ) + dragon_envs = [json.loads(config_dict) for config_dict in dragon_env_jsons] + + if num_dragon_envs: + sliced_dragon_envs = itertools.islice(dragon_envs, num_dragon_envs) + return list(sliced_dragon_envs) + return dragon_envs + + @classmethod + def _parse_launched_dragon_server_info_from_files( + cls, + file_paths: t.List[t.Union[str, "os.PathLike[str]"]], + num_dragon_envs: t.Optional[int] = None, + ) -> t.List[t.Dict[str, str]]: + with fileinput.FileInput(file_paths) as ifstream: + dragon_envs = cls._parse_launched_dragon_server_info_from_iterable( + ifstream, num_dragon_envs + ) + return dragon_envs + + @staticmethod + def _send_req_with_socket( + socket: zmq.Socket[t.Any], request: DragonRequest, flags: int = 0 + ) -> DragonResponse: + client = dragonSockets.as_client(socket) + with DRG_LOCK: + logger.debug(f"Sending {type(request).__name__}: {request}") + send_trials = 5 + while send_trials: + try: + client.send(request, flags) + break + except zmq.Again as e: + send_trials -= 1 + logger.debug( + "Could not send request to Dragon server in " + f"{int(client.socket.getsockopt(zmq.SNDTIMEO))/1000} seconds" + ) + if send_trials < 1: + raise e + + time.sleep(0.1) + receive_trials = 5 + response = None + while receive_trials: + try: + response = client.recv() + break + except zmq.Again as e: + receive_trials -= 1 + logger.debug( + "Did not receive response from Dragon server in " + f"{int(client.socket.getsockopt(zmq.RCVTIMEO))/1000} seconds" + ) + if receive_trials < 1: + raise e + + if response is None: + raise SmartSimError("Could not receive response from Dragon server") + + logger.debug(f"Received {type(response).__name__}: {response}") + return response + + +def _assert_schema_type(obj: object, typ: t.Type[_SchemaT], /) -> _SchemaT: + if not isinstance(obj, typ): + raise TypeError(f"Expected schema of type `{typ}`, 
but got {type(obj)}") + return obj + + +def _dragon_cleanup(server_socket: zmq.Socket[t.Any], server_process_pid: int) -> None: + try: + # pylint: disable-next=protected-access + DragonConnector._send_req_with_socket(server_socket, DragonShutdownRequest()) + except zmq.error.ZMQError as e: + # Can't use the logger as I/O file may be closed + print("Could not send shutdown request to dragon server") + print(f"ZMQ error: {e}", flush=True) + finally: + time.sleep(1) + try: + os.kill(server_process_pid, signal.SIGINT) + print("Sent SIGINT to dragon server") + except ProcessLookupError: + # Can't use the logger as I/O file may be closed + print("Dragon server is not running.", flush=True) + + +def _resolve_dragon_path(fallback: t.Union[str, "os.PathLike[str]"]) -> Path: + dragon_server_path = CONFIG.dragon_server_path or os.path.join( + fallback, ".smartsim", "dragon" + ) + dragon_server_paths = dragon_server_path.split(":") + if len(dragon_server_paths) > 1: + logger.warning( + "Multiple dragon servers not supported, " + "will connect to (or start) first server in list." 
+ ) + return Path(dragon_server_paths[0]) diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py index 1a2d5ad46..7b71c6fb8 100644 --- a/smartsim/_core/launcher/dragon/dragonLauncher.py +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -26,58 +26,29 @@ from __future__ import annotations -import atexit -import fileinput -import itertools -import json import os -import signal -import subprocess -import sys -import time import typing as t -from pathlib import Path -from threading import RLock -import zmq - -from smartsim._core.launcher.dragon import dragonSockets -from smartsim.error.errors import SmartSimError +from smartsim._core.launcher.dragon.dragonConnector import DragonConnector, _SchemaT from ....error import LauncherError from ....log import get_logger from ....settings import DragonRunSettings, RunSettings, SettingsBase from ....status import SmartSimStatus -from ...config import CONFIG from ...schemas import ( - DragonBootstrapRequest, - DragonBootstrapResponse, - DragonHandshakeRequest, - DragonHandshakeResponse, - DragonRequest, - DragonResponse, DragonRunRequest, DragonRunResponse, - DragonShutdownRequest, DragonStopRequest, DragonStopResponse, DragonUpdateStatusRequest, DragonUpdateStatusResponse, ) -from ...utils.network import find_free_port, get_best_interface_and_address from ..launcher import WLMLauncher from ..step import DragonStep, LocalStep, Step from ..stepInfo import StepInfo logger = get_logger(__name__) -_SchemaT = t.TypeVar("_SchemaT", bound=t.Union[DragonRequest, DragonResponse]) - -DRG_LOCK = RLock() -DRG_CTX = zmq.Context() -DRG_CTX.setsockopt(zmq.REQ_CORRELATE, 1) -DRG_CTX.setsockopt(zmq.REQ_RELAXED, 1) - class DragonLauncher(WLMLauncher): """This class encapsulates the functionality needed @@ -92,196 +63,14 @@ class DragonLauncher(WLMLauncher): def __init__(self) -> None: super().__init__() - self._context = DRG_CTX - self._timeout = CONFIG.dragon_server_timeout - 
self._reconnect_timeout = CONFIG.dragon_server_reconnect_timeout - self._startup_timeout = CONFIG.dragon_server_startup_timeout - self._context.setsockopt(zmq.SNDTIMEO, value=self._timeout) - self._context.setsockopt(zmq.RCVTIMEO, value=self._timeout) - self._dragon_head_socket: t.Optional[zmq.Socket[t.Any]] = None - self._dragon_head_process: t.Optional[subprocess.Popen[bytes]] = None - # Returned by dragon head, useful if shutdown is to be requested - # but process was started by another launcher - self._dragon_head_pid: t.Optional[int] = None - self._dragon_server_path = os.getenv( - "SMARTSIM_DRAGON_SERVER_PATH_EXP", - os.getenv("SMARTSIM_DRAGON_SERVER_PATH", None), - ) - if self._dragon_server_path is None: - raise SmartSimError( - "Dragon server path was not set. " - "This should not happen if the launcher was started by an experiment.\n" - "If the DragonLauncher was started manually, " - "then the environment variable SMARTSIM_DRAGON_SERVER_PATH " - "should be set to an existing directory." 
- ) + self._connector = DragonConnector() @property def is_connected(self) -> bool: - return self._dragon_head_socket is not None - - def _handshake(self, address: str) -> None: - self._dragon_head_socket = self._context.socket(zmq.REQ) - self._dragon_head_socket.connect(address) - try: - dragon_handshake = _assert_schema_type( - self._send_request(DragonHandshakeRequest()), DragonHandshakeResponse - ) - self._dragon_head_pid = dragon_handshake.dragon_pid - logger.debug( - f"Successful handshake with Dragon server at address {address}" - ) - except (zmq.ZMQError, zmq.Again) as e: - logger.debug(e) - self._dragon_head_socket.close() - self._dragon_head_socket = None - raise LauncherError( - f"Unsuccessful handshake with Dragon server at address {address}" - ) from e - - def _set_timeout(self, timeout: int) -> None: - self._context.setsockopt(zmq.SNDTIMEO, value=timeout) - self._context.setsockopt(zmq.RCVTIMEO, value=timeout) - - def ensure_connected(self) -> None: - if not self.is_connected: - self._connect_to_dragon() - if not self.is_connected: - raise LauncherError("Could not connect to Dragon server") - - # pylint: disable-next=too-many-statements,too-many-locals - def _connect_to_dragon(self) -> None: - with DRG_LOCK: - # TODO use manager instead - if self.is_connected: - return - - path = _resolve_dragon_path(self._dragon_server_path) - dragon_config_log = path / CONFIG.dragon_log_filename - - if dragon_config_log.is_file(): - dragon_confs = self._parse_launched_dragon_server_info_from_files( - [dragon_config_log] - ) - logger.debug(dragon_confs) - for dragon_conf in dragon_confs: - if not "address" in dragon_conf: - continue - logger.debug( - "Found dragon server config file. Checking if the server" - f" is still up at address {dragon_conf['address']}." 
- ) - try: - self._set_timeout(self._reconnect_timeout) - self._handshake(dragon_conf["address"]) - except LauncherError as e: - logger.warning(e) - finally: - self._set_timeout(self._timeout) - if self.is_connected: - return - - path.mkdir(parents=True, exist_ok=True) - - cmd = [ - "dragon", - sys.executable, - "-m", - "smartsim._core.entrypoints.dragon", - ] - - address = get_best_interface_and_address().address - socket_addr = "" - launcher_socket: t.Optional[zmq.Socket[t.Any]] = None - if address is not None: - self._set_timeout(self._startup_timeout) - launcher_socket = self._context.socket(zmq.REP) - - # find first available port >= 5995 - port = find_free_port(start=5995) - socket_addr = f"tcp://{address}:{port}" - logger.debug(f"Binding launcher to {socket_addr}") - - launcher_socket.bind(socket_addr) - cmd += ["+launching_address", socket_addr] - - dragon_out_file = path / "dragon_head.out" - dragon_err_file = path / "dragon_head.err" - - with open(dragon_out_file, "w", encoding="utf-8") as dragon_out, open( - dragon_err_file, "w", encoding="utf-8" - ) as dragon_err: - current_env = os.environ.copy() - current_env.update({"PYTHONUNBUFFERED": "1"}) - # pylint: disable-next=consider-using-with - self._dragon_head_process = subprocess.Popen( - args=cmd, - bufsize=0, - stderr=dragon_err.fileno(), - stdout=dragon_out.fileno(), - cwd=path, - shell=False, - env=current_env, - start_new_session=True, - ) - - if launcher_socket is None: - raise SmartSimError("Socket failed to initialize") - - def log_dragon_outputs() -> None: - if self._dragon_head_process: - self._dragon_head_process.wait(1.0) - if self._dragon_head_process.stdout: - for line in iter( - self._dragon_head_process.stdout.readline, b"" - ): - logger.info(line.decode("utf-8").rstrip()) - if self._dragon_head_process.stderr: - for line in iter( - self._dragon_head_process.stderr.readline, b"" - ): - logger.warning(line.decode("utf-8").rstrip()) - logger.warning(self._dragon_head_process.returncode) - - 
if address is not None: - server = dragonSockets.as_server(launcher_socket) - logger.debug(f"Listening to {socket_addr}") - request = _assert_schema_type(server.recv(), DragonBootstrapRequest) - - logger.debug(f"Connecting launcher to {request.address}") - server.send( - DragonBootstrapResponse(dragon_pid=self._dragon_head_process.pid) - ) - - launcher_socket.close() - self._set_timeout(self._timeout) - self._handshake(request.address) - - # Only the launcher which started the server is - # responsible of it, that's why we register the - # cleanup in this code branch. - # The cleanup function should not have references - # to this object to avoid Garbage Collector lockup - server_socket = self._dragon_head_socket - server_process_pid = self._dragon_head_process.pid - - if server_socket is not None and self._dragon_head_process is not None: - atexit.register( - _dragon_cleanup, - server_socket=server_socket, - server_process_pid=server_process_pid, - ) - else: - # TODO parse output file - log_dragon_outputs() - raise LauncherError("Could not receive address of Dragon head process") + return self._connector.is_connected def cleanup(self) -> None: - if self._dragon_head_socket is not None and self._dragon_head_pid is not None: - _dragon_cleanup( - server_socket=self._dragon_head_socket, - server_process_pid=self._dragon_head_pid, - ) + self._connector.cleanup() # RunSettings types supported by this launcher @property @@ -309,13 +98,12 @@ def run(self, step: Step) -> t.Optional[str]: out, err = step.get_output_files() if isinstance(step, DragonStep): - self.ensure_connected() run_args = step.run_settings.run_args env = step.run_settings.env_vars nodes = int(run_args.get("nodes", None) or 1) tasks_per_node = int(run_args.get("tasks-per-node", None) or 1) response = _assert_schema_type( - self._send_request( + self._connector.send_request( DragonRunRequest( exe=cmd[0], exe_args=cmd[1:], @@ -354,12 +142,13 @@ def stop(self, step_name: str) -> StepInfo: :rtype: StepInfo 
""" - self.ensure_connected() + self._connector.ensure_connected() stepmap = self.step_mapping[step_name] step_id = str(stepmap.step_id) _assert_schema_type( - self._send_request(DragonStopRequest(step_id=step_id)), DragonStopResponse + self._connector.send_request(DragonStopRequest(step_id=step_id)), + DragonStopResponse, ) _, step_info = self.get_step_update([step_name])[0] @@ -380,10 +169,8 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: :rtype: list[StepInfo] """ - self.ensure_connected() - response = _assert_schema_type( - self._send_request(DragonUpdateStatusRequest(step_ids=step_ids)), + self._connector.send_request(DragonUpdateStatusRequest(step_ids=step_ids)), DragonUpdateStatusResponse, ) @@ -414,116 +201,11 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: updates.append(info) return updates - def _send_request(self, request: DragonRequest, flags: int = 0) -> DragonResponse: - if (socket := self._dragon_head_socket) is None: - raise LauncherError("Launcher is not connected to Dragon") - return self.send_req_with_socket(socket, request, flags) - def __str__(self) -> str: return "Dragon" - @staticmethod - def _parse_launched_dragon_server_info_from_iterable( - stream: t.Iterable[str], num_dragon_envs: t.Optional[int] = None - ) -> t.List[t.Dict[str, str]]: - lines = (line.strip() for line in stream) - lines = (line for line in lines if line) - tokenized = (line.split(maxsplit=1) for line in lines) - tokenized = (tokens for tokens in tokenized if len(tokens) > 1) - dragon_env_jsons = ( - config_dict - for first, config_dict in tokenized - if "DRAGON_SERVER_CONFIG" in first - ) - dragon_envs = [json.loads(config_dict) for config_dict in dragon_env_jsons] - - if num_dragon_envs: - sliced_dragon_envs = itertools.islice(dragon_envs, num_dragon_envs) - return list(sliced_dragon_envs) - return dragon_envs - - @classmethod - def _parse_launched_dragon_server_info_from_files( - cls, - file_paths: 
t.List[t.Union[str, "os.PathLike[str]"]], - num_dragon_envs: t.Optional[int] = None, - ) -> t.List[t.Dict[str, str]]: - with fileinput.FileInput(file_paths) as ifstream: - dragon_envs = cls._parse_launched_dragon_server_info_from_iterable( - ifstream, num_dragon_envs - ) - - return dragon_envs - - @staticmethod - def send_req_with_socket( - socket: zmq.Socket[t.Any], request: DragonRequest, flags: int = 0 - ) -> DragonResponse: - client = dragonSockets.as_client(socket) - with DRG_LOCK: - logger.debug(f"Sending {type(request).__name__}: {request}") - send_trials = 5 - while send_trials: - try: - client.send(request, flags) - break - except zmq.Again as e: - send_trials -= 1 - logger.debug( - f"Could not send request in {client.socket.getsockopt(zmq.SNDTIMEO)/1000} seconds" - ) - if send_trials < 1: - raise e - - time.sleep(1) - receive_trials = 5 - while receive_trials: - try: - response = client.recv() - break - except zmq.Again as e: - receive_trials -= 1 - logger.debug( - f"Did not receive response in {client.socket.getsockopt(zmq.RCVTIMEO)/1000} seconds" - ) - if receive_trials < 1: - raise e - - logger.debug(f"Received {type(response).__name__}: {response}") - return response - def _assert_schema_type(obj: object, typ: t.Type[_SchemaT], /) -> _SchemaT: if not isinstance(obj, typ): raise TypeError(f"Expected schema of type `{typ}`, but got {type(obj)}") return obj - - -def _dragon_cleanup(server_socket: zmq.Socket[t.Any], server_process_pid: int) -> None: - try: - DragonLauncher.send_req_with_socket(server_socket, DragonShutdownRequest()) - except zmq.error.ZMQError as e: - # Can't use the logger as I/O file may be closed - print("Could not send shutdown request to dragon server") - print(f"ZMQ error: {e}", flush=True) - finally: - time.sleep(1) - try: - os.kill(server_process_pid, signal.SIGINT) - print("Sent SIGINT to dragon server") - except ProcessLookupError: - # Can't use the logger as I/O file may be closed - print("Dragon server is not running.", 
flush=True) - - -def _resolve_dragon_path(fallback: t.Union[str, "os.PathLike[str]"]) -> Path: - dragon_server_path = CONFIG.dragon_server_path or os.path.join( - fallback, ".smartsim", "dragon" - ) - dragon_server_paths = dragon_server_path.split(":") - if len(dragon_server_paths) > 1: - logger.warning( - "Multiple dragon servers not supported, " - "will connect to (or start) first server in list." - ) - return Path(dragon_server_paths[0]) diff --git a/smartsim/_core/utils/telemetry/telemetry.py b/smartsim/_core/utils/telemetry/telemetry.py index 584eb554e..eb0affe87 100644 --- a/smartsim/_core/utils/telemetry/telemetry.py +++ b/smartsim/_core/utils/telemetry/telemetry.py @@ -144,9 +144,7 @@ def init_job_manager(self) -> None: self.job_manager.set_launcher(self._launcher) self.job_manager.start() - def set_launcher( - self, launcher_type: str, exp_dir: t.Union[str, "os.PathLike[str]"] - ) -> None: + def set_launcher(self, launcher_type: str) -> None: """Set the launcher for the experiment""" self.init_launcher(launcher_type) @@ -180,7 +178,7 @@ def process_manifest(self, manifest_path: str) -> None: exp_dir = pathlib.Path(manifest_path).parent.parent.parent if self._launcher is None: - self.set_launcher(manifest.launcher, exp_dir) + self.set_launcher(manifest.launcher) if not self._launcher: raise SmartSimError(f"Unable to set launcher from {manifest_path}") diff --git a/smartsim/experiment.py b/smartsim/experiment.py index fbfe7d712..e79b252f6 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -176,7 +176,7 @@ def __init__( self.db_identifiers: t.Set[str] = set() self._telemetry_cfg = ExperimentTelemetryConfiguration() - def _set_dragon_server_path(self): + def _set_dragon_server_path(self) -> None: """Set path for dragon server through environment varialbes""" if not "SMARTSIM_DRAGON_SERVER_PATH" in environ: environ["SMARTSIM_DRAGON_SERVER_PATH_EXP"] = self.exp_path From 6046bc55c3bb198359257762317f1caeaf898d55 Mon Sep 17 00:00:00 2001 From: Al 
Rigazzi Date: Fri, 5 Apr 2024 18:47:51 -0500 Subject: [PATCH 003/101] Create batch script for DragonBatchStep --- conftest.py | 2 +- .../_core/launcher/dragon/dragonLauncher.py | 125 ++++++++++---- .../_core/launcher/slurm/slurmLauncher.py | 1 + smartsim/_core/launcher/step/__init__.py | 2 +- smartsim/_core/launcher/step/dragonStep.py | 159 +++++++++++++++++- smartsim/settings/settings.py | 4 +- tests/full_wlm/test_generic_batch_launch.py | 3 +- 7 files changed, 256 insertions(+), 40 deletions(-) diff --git a/conftest.py b/conftest.py index 86ccf08a3..43f0fc773 100644 --- a/conftest.py +++ b/conftest.py @@ -45,7 +45,7 @@ import smartsim from smartsim import Experiment -from smartsim._core.launcher.dragon.dragonLauncher import DragonLauncher, _dragon_cleanup +from smartsim._core.launcher.dragon.dragonLauncher import DragonLauncher from smartsim._core.config import CONFIG from smartsim._core.config.config import Config from smartsim._core.utils.telemetry.telemetry import JobEntity diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py index 7b71c6fb8..69802cd8e 100644 --- a/smartsim/_core/launcher/dragon/dragonLauncher.py +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -29,11 +29,15 @@ import os import typing as t -from smartsim._core.launcher.dragon.dragonConnector import DragonConnector, _SchemaT - from ....error import LauncherError from ....log import get_logger -from ....settings import DragonRunSettings, RunSettings, SettingsBase +from ....settings import ( + DragonRunSettings, + QsubBatchSettings, + RunSettings, + SbatchSettings, + SettingsBase, +) from ....status import SmartSimStatus from ...schemas import ( DragonRunRequest, @@ -44,8 +48,10 @@ DragonUpdateStatusResponse, ) from ..launcher import WLMLauncher -from ..step import DragonStep, LocalStep, Step +from ..slurm.slurmLauncher import SlurmLauncher +from ..step import DragonBatchStep, DragonStep, LocalStep, Step from ..stepInfo import 
StepInfo +from .dragonConnector import DragonConnector, _SchemaT logger = get_logger(__name__) @@ -64,6 +70,7 @@ class DragonLauncher(WLMLauncher): def __init__(self) -> None: super().__init__() self._connector = DragonConnector() + self._slurm_launcher = SlurmLauncher() @property def is_connected(self) -> bool: @@ -76,7 +83,12 @@ def cleanup(self) -> None: @property def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: # RunSettings types supported by this launcher - return {DragonRunSettings: DragonStep, RunSettings: LocalStep} + return { + DragonRunSettings: DragonStep, + SbatchSettings: DragonBatchStep, + QsubBatchSettings: DragonBatchStep, + RunSettings: LocalStep, + } def run(self, step: Step) -> t.Optional[str]: """Run a job step through Slurm @@ -97,7 +109,23 @@ def run(self, step: Step) -> t.Optional[str]: cmd = step.get_launch_cmd() out, err = step.get_output_files() - if isinstance(step, DragonStep): + if isinstance(step, DragonBatchStep) and isinstance( + step.batch_settings, SbatchSettings + ): + # wait for batch step to submit successfully + logger.warning(f"{cmd}, {step.cwd}") + return_code, out, err = self.task_manager.start_and_wait(cmd, step.cwd) + if return_code != 0: + raise LauncherError(f"Sbatch submission failed\n {out}\n {err}") + if out: + slurm_step_id = out.strip() + logger.debug(f"Gleaned batch job id: {step_id} for {step.name}") + + self._slurm_launcher.step_mapping.add( + step.name, slurm_step_id, task_id, step.managed + ) + step_id = "SLURM-" + slurm_step_id + elif isinstance(step, DragonStep): run_args = step.run_settings.run_args env = step.run_settings.env_vars nodes = int(run_args.get("nodes", None) or 1) @@ -142,10 +170,12 @@ def stop(self, step_name: str) -> StepInfo: :rtype: StepInfo """ - self._connector.ensure_connected() - stepmap = self.step_mapping[step_name] step_id = str(stepmap.step_id) + + if step_id.startswith("SLURM-"): + return self._slurm_launcher.stop(step_name.split("-", maxsplit=1)[1]) + 
_assert_schema_type( self._connector.send_request(DragonStopRequest(step_id=step_id)), DragonStopResponse, @@ -169,37 +199,62 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: :rtype: list[StepInfo] """ - response = _assert_schema_type( - self._connector.send_request(DragonUpdateStatusRequest(step_ids=step_ids)), - DragonUpdateStatusResponse, - ) + step_id_updates: dict[str, StepInfo] = {} - # create StepInfo objects to return - updates: t.List[StepInfo] = [] - # Order matters as we return an ordered list of StepInfo objects + dragon_step_ids = [] + slurm_step_ids = [] for step_id in step_ids: - if step_id not in response.statuses: - msg = "Missing step id update from Dragon launcher." - if response.error_message is not None: - msg += "\nDragon backend reported following error: " - msg += response.error_message - raise LauncherError(msg) - - status, ret_codes = response.statuses[step_id] - if ret_codes: - grp_ret_code = min(ret_codes) - if any(ret_codes): - _err_msg = ( - f"One or more processes failed for job {step_id}" - f"Return codes were: {ret_codes}" - ) - logger.error(_err_msg) + if step_id.startswith("SLURM-"): + print(step_id.split("-", maxsplit=1)[1]) + slurm_step_ids.append(step_id) else: - grp_ret_code = None - info = StepInfo(status, str(status), grp_ret_code) + dragon_step_ids.append(step_id) + + if slurm_step_ids: + # pylint: disable-next=protected-access + slurm_updates = self._slurm_launcher._get_managed_step_update( + [step_id.split("-", maxsplit=1)[1] for step_id in slurm_step_ids] + ) + step_id_updates.update(dict(zip(slurm_step_ids, slurm_updates))) - updates.append(info) - return updates + if dragon_step_ids: + response = _assert_schema_type( + self._connector.send_request( + DragonUpdateStatusRequest(step_ids=dragon_step_ids) + ), + DragonUpdateStatusResponse, + ) + + for step_id in step_ids: + if step_id not in response.statuses: + msg = "Missing step id update from Dragon launcher." 
+ if response.error_message is not None: + msg += "\nDragon backend reported following error: " + msg += response.error_message + logger.error(msg) + info = StepInfo( + SmartSimStatus.STATUS_FAILED, + str(SmartSimStatus.STATUS_FAILED), + -1, + ) + else: + status, ret_codes = response.statuses[step_id] + if ret_codes: + grp_ret_code = min(ret_codes) + if any(ret_codes): + _err_msg = ( + f"One or more processes failed for job {step_id}" + f"Return codes were: {ret_codes}" + ) + logger.error(_err_msg) + else: + grp_ret_code = None + info = StepInfo(status, str(status), grp_ret_code) + + step_id_updates[step_id] = info + + # Order matters as we return an ordered list of StepInfo objects + return [step_id_updates[step_id] for step_id in step_ids] def __str__(self) -> str: return "Dragon" diff --git a/smartsim/_core/launcher/slurm/slurmLauncher.py b/smartsim/_core/launcher/slurm/slurmLauncher.py index a25e62806..0ae327030 100644 --- a/smartsim/_core/launcher/slurm/slurmLauncher.py +++ b/smartsim/_core/launcher/slurm/slurmLauncher.py @@ -138,6 +138,7 @@ def run(self, step: Step) -> t.Optional[str]: # Launch a batch step with Slurm if isinstance(step, SbatchStep): # wait for batch step to submit successfully + print(cmd_list, step.cwd) return_code, out, err = self.task_manager.start_and_wait(cmd_list, step.cwd) if return_code != 0: raise LauncherError(f"Sbatch submission failed\n {out}\n {err}") diff --git a/smartsim/_core/launcher/step/__init__.py b/smartsim/_core/launcher/step/__init__.py index d9c4d35bb..c492f3e97 100644 --- a/smartsim/_core/launcher/step/__init__.py +++ b/smartsim/_core/launcher/step/__init__.py @@ -25,7 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
from .alpsStep import AprunStep -from .dragonStep import DragonStep +from .dragonStep import DragonBatchStep, DragonStep from .localStep import LocalStep from .lsfStep import BsubBatchStep, JsrunStep from .mpiStep import MpiexecStep, MpirunStep, OrterunStep diff --git a/smartsim/_core/launcher/step/dragonStep.py b/smartsim/_core/launcher/step/dragonStep.py index 38ed36401..90d8d752f 100644 --- a/smartsim/_core/launcher/step/dragonStep.py +++ b/smartsim/_core/launcher/step/dragonStep.py @@ -24,11 +24,21 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import json +import os import shutil +import sys import typing as t +from ...._core.schemas.dragonRequests import DragonRunRequest, request_registry +from ....error.errors import SSUnsupportedError from ....log import get_logger -from ....settings import DragonRunSettings, Singularity +from ....settings import ( + DragonRunSettings, + QsubBatchSettings, + SbatchSettings, + Singularity, +) from .step import Step logger = get_logger(__name__) @@ -90,3 +100,150 @@ def _get_exe_args_list(run_setting: DragonRunSettings) -> t.List[str]: exe_args = run_setting.exe_args args: t.List[str] = exe_args if isinstance(exe_args, list) else [exe_args] return args + + +class DragonBatchStep(Step): + def __init__( + self, + name: str, + cwd: str, + batch_settings: t.Union[SbatchSettings, QsubBatchSettings], + ) -> None: + """Initialize a Dragon batch step (Slurm sbatch or PBS qsub) + + :param name: name of the entity to launch + :type name: str + :param cwd: path to launch dir + :type cwd: str + :param batch_settings: batch settings for entity + :type batch_settings: SbatchSettings | QsubBatchSettings + """ + super().__init__(name, cwd, batch_settings) + self.steps: t.List[Step] = [] + self.managed = True + self.batch_settings = batch_settings + self._request_file_name = "requests.json" + + def get_launch_cmd(self) -> t.List[str]: + """Get the launch command for the batch + +
:return: launch command for the batch + :rtype: list[str] + """ + if isinstance(self.batch_settings, SbatchSettings): + script = self._write_sbatch_script() + return [self.batch_settings.batch_cmd, "--parsable", script] + if isinstance(self.batch_settings, QsubBatchSettings): + script = self._write_qsub_script() + return [self.batch_settings.batch_cmd, script] + + raise SSUnsupportedError( + "DragonBatchStep only support SbatchSettings and QsubBatchSettings" + ) + + def add_to_batch(self, step: Step) -> None: + """Add a job step to this batch + + :param step: a job step instance e.g. DragonStep + :type step: Step + """ + self.steps.append(step) + logger.debug(f"Added step command to batch for {step.name}") + + @staticmethod + def _dragon_entrypoint_cmd(request_file: str) -> str: + """Return command needed to run the Dragon entrypoint""" + cmd = [ + f"{sys.executable}", + "-m", + "smartsim._core.entrypoints.dragon_client", + "--submit", + f"{request_file}", + "\n", + ] + return " ".join(cmd) + + def _write_request_file(self) -> str: + """Write json file with requests to submit to Dragon server""" + request_file = self.get_step_file( + ending="json", script_name=self._request_file_name + ) + requests = [] + for step in self.steps: + run_settings = t.cast(DragonRunSettings, step.step_settings) + run_args = run_settings.run_args + env = run_settings.env_vars + nodes = int(run_args.get("nodes", None) or 1) + tasks_per_node = int(run_args.get("tasks-per-node", None) or 1) + + cmd = step.get_launch_cmd() + out, err = step.get_output_files() + request = DragonRunRequest( + exe=cmd[0], + exe_args=cmd[1:], + path=step.cwd, + name=step.name, + nodes=nodes, + tasks_per_node=tasks_per_node, + env=env, + current_env=os.environ, + output_file=out, + error_file=err, + ) + requests.append(request_registry.to_string(request)) + with open(request_file, "w", encoding="utf-8") as script_file: + script_file.write(json.dumps(requests)) + + return request_file + + def 
_write_sbatch_script(self) -> str: + """Write the batch script + + :return: batch script path after writing + :rtype: str + """ + batch_script = self.get_step_file(ending=".sh") + output, error = self.get_output_files() + request_file = self._write_request_file() + with open(batch_script, "w", encoding="utf-8") as script_file: + script_file.write("#!/bin/bash\n\n") + script_file.write(f"#SBATCH --output={output}\n") + script_file.write(f"#SBATCH --error={error}\n") + script_file.write(f"#SBATCH --job-name={self.name}\n") + + # add additional sbatch options + for opt in self.batch_settings.format_batch_args(): + script_file.write(f"#SBATCH {opt}\n") + + for cmd in self.batch_settings.preamble: + script_file.write(f"{cmd}\n") + + script_file.write(DragonBatchStep._dragon_entrypoint_cmd(request_file)) + return batch_script + + def _write_qsub_script(self) -> str: + """Write the batch script + + :return: batch script path after writing + :rtype: str + """ + batch_script = self.get_step_file(ending=".sh") + output, error = self.get_output_files() + request_file = self._write_request_file() + with open(batch_script, "w", encoding="utf-8") as script_file: + script_file.write("#!/bin/bash\n\n") + script_file.write(f"#PBS -o {output}\n") + script_file.write(f"#PBS -e {error}\n") + script_file.write(f"#PBS -N {self.name}\n") + script_file.write("#PBS -V \n") + + # add additional qsub options + for opt in self.batch_settings.format_batch_args(): + script_file.write(f"#PBS {opt}\n") + + for cmd in self.batch_settings.preamble: + script_file.write(f"{cmd}\n") + + script_file.write(DragonBatchStep._dragon_entrypoint_cmd(request_file)) + + return batch_script diff --git a/smartsim/settings/settings.py b/smartsim/settings/settings.py index 60890c896..1e0b475f6 100644 --- a/smartsim/settings/settings.py +++ b/smartsim/settings/settings.py @@ -86,8 +86,10 @@ def create_batch_settings( "lsf": BsubBatchSettings, } - if launcher == "auto": + if launcher in ["auto", "dragon"]:
launcher = detect_launcher() + if launcher == "dragon": + by_launcher["dragon"] = by_launcher["launcher"] if launcher == "local": raise SmartSimError("Local launcher does not support batch workloads") diff --git a/tests/full_wlm/test_generic_batch_launch.py b/tests/full_wlm/test_generic_batch_launch.py index 3487ca81c..ac9a8651f 100644 --- a/tests/full_wlm/test_generic_batch_launch.py +++ b/tests/full_wlm/test_generic_batch_launch.py @@ -63,8 +63,8 @@ def test_batch_model(fileutils, test_dir, wlmutils): model = exp.create_model( "model", path=test_dir, run_settings=run_settings, batch_settings=batch_settings ) - model.set_path(test_dir) + exp.generate(model) exp.start(model, block=True) statuses = exp.get_status(model) assert len(statuses) == 1 @@ -91,6 +91,7 @@ def test_batch_ensemble(fileutils, test_dir, wlmutils): ensemble.add_model(M2) ensemble.set_path(test_dir) + exp.generate(ensemble) exp.start(ensemble, block=True) statuses = exp.get_status(ensemble) assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) From cf9f0d4e59c9c23c1cca2abfcb8c86c4ecdfbe31 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Sun, 7 Apr 2024 18:40:58 -0500 Subject: [PATCH 004/101] Add thread to backend, make slurm batch job start --- conftest.py | 17 +-- smartsim/_core/config/config.py | 5 +- smartsim/_core/entrypoints/dragon.py | 22 +++- smartsim/_core/entrypoints/dragon_client.py | 103 ++++++++++++++++++ .../_core/launcher/dragon/dragonBackend.py | 24 +++- .../_core/launcher/dragon/dragonConnector.py | 48 ++------ .../_core/launcher/dragon/dragonLauncher.py | 80 ++++++++++---- smartsim/_core/launcher/step/dragonStep.py | 8 +- smartsim/_core/schemas/dragonRequests.py | 6 +- smartsim/experiment.py | 4 +- tests/full_wlm/test_generic_batch_launch.py | 2 +- tests/on_wlm/test_dragon.py | 12 +- tests/test_controller_errors.py | 16 ++- tests/test_dragon_launcher.py | 8 +- 14 files changed, 250 insertions(+), 105 deletions(-) create mode 100644 
smartsim/_core/entrypoints/dragon_client.py diff --git a/conftest.py b/conftest.py index 43f0fc773..be2cf2300 100644 --- a/conftest.py +++ b/conftest.py @@ -45,6 +45,7 @@ import smartsim from smartsim import Experiment +from smartsim._core.launcher.dragon.dragonConnector import DragonConnector from smartsim._core.launcher.dragon.dragonLauncher import DragonLauncher from smartsim._core.config import CONFIG from smartsim._core.config.config import Config @@ -152,7 +153,7 @@ def pytest_sessionfinish( Called after whole test run finished, right before returning the exit status to the system. """ - if exitstatus == 0: + if False and exitstatus == 0: cleanup_attempts = 5 while cleanup_attempts > 0: try: @@ -780,16 +781,10 @@ def global_dragon_teardown() -> None: """ if test_launcher != "dragon" or CONFIG.dragon_server_path is None: return - exp_path = os.path.join(test_output_root, "dragon_teardown") - os.makedirs(exp_path, exist_ok=True) - exp: Experiment = Experiment("dragon_shutdown", exp_path=exp_path, launcher=test_launcher) - rs = exp.create_run_settings("sleep", ["0.1"]) - model = exp.create_model("dummy", run_settings=rs) - exp.generate(model, overwrite=True) - exp.start(model, block=True) - - launcher: DragonLauncher = exp._control._launcher - launcher.cleanup() + logger.debug(f"Tearing down Dragon infrastructure, server path: {CONFIG.dragon_server_path}") + dragon_connector = DragonConnector() + dragon_connector.ensure_connected() + dragon_connector.cleanup() time.sleep(5) diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 4a16b6def..f3b1df72a 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -155,7 +155,10 @@ def database_file_parse_interval(self) -> int: @property def dragon_server_path(self) -> t.Optional[str]: - return os.getenv("SMARTSIM_DRAGON_SERVER_PATH") + return os.getenv( + "SMARTSIM_DRAGON_SERVER_PATH", + os.getenv("SMARTSIM_DRAGON_SERVER_PATH_EXP", None), + ) @property def 
dragon_server_reconnect_timeout(self) -> int: diff --git a/smartsim/_core/entrypoints/dragon.py b/smartsim/_core/entrypoints/dragon.py index 4706e8c5e..3c3c6507e 100644 --- a/smartsim/_core/entrypoints/dragon.py +++ b/smartsim/_core/entrypoints/dragon.py @@ -40,7 +40,7 @@ from smartsim._core.launcher.dragon.dragonBackend import DragonBackend from smartsim._core.schemas import DragonBootstrapRequest, DragonBootstrapResponse from smartsim._core.utils.network import get_best_interface_and_address -from smartsim.log import get_logger +from smartsim.log import ContextThread, get_logger logger = get_logger("Dragon Server") @@ -97,6 +97,11 @@ def run( dragon_head_socket.bind(dragon_head_address) dragon_backend = DragonBackend(pid=dragon_pid) + backend_updater = ContextThread( + name="JobManager", daemon=True, target=dragon_backend.update + ) + backend_updater.start() + server = dragonSockets.as_server(dragon_head_socket) logger.debug(f"Listening to {dragon_head_address}") @@ -105,9 +110,11 @@ def run( req = server.recv() logger.debug(f"Received {type(req).__name__} {req}") except zmq.Again: - # dragon_backend.print_status() - dragon_backend.update() - continue + if not (dragon_backend.should_shutdown or SHUTDOWN_INITIATED): + logger.debug(f"Listening to {dragon_head_address}") + continue + logger.info("Shutdown has been requested") + break resp = dragon_backend.process_request(req) @@ -117,14 +124,17 @@ def run( except zmq.Again: logger.error("Could not send response back to launcher.") - dragon_backend.update() - dragon_backend.print_status() if not (dragon_backend.should_shutdown or SHUTDOWN_INITIATED): logger.debug(f"Listening to {dragon_head_address}") else: logger.info("Shutdown has been requested") break + try: + del backend_updater + except Exception: + logger.debug("Could not delete backend updater thread") + def main(args: argparse.Namespace, zmq_context: zmq.Context[t.Any]) -> int: if_config = get_best_interface_and_address() diff --git 
a/smartsim/_core/entrypoints/dragon_client.py b/smartsim/_core/entrypoints/dragon_client.py new file mode 100644 index 000000000..14b1495d9 --- /dev/null +++ b/smartsim/_core/entrypoints/dragon_client.py @@ -0,0 +1,103 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +import argparse +import json +import os +import signal +import sys +import time +import typing as t + +import psutil + +from smartsim._core.launcher.dragon.dragonConnector import DragonConnector +from smartsim._core.schemas import ( + DragonRequest, + DragonShutdownRequest, + request_registry, +) +from smartsim.log import get_logger + +SIGNALS = [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM, signal.SIGABRT] + +logger = get_logger("Dragon Client") + + +def cleanup() -> None: + logger.debug("Cleaning up") + + +def main(args: argparse.Namespace) -> int: + + requests: t.List[DragonRequest] = [] + + with open(args.submit, "r", encoding="utf-8") as request_file: + req_strings = json.load(fp=request_file) + + for req_str in req_strings: + requests.append(request_registry.from_string(req_str)) + + requests.append(DragonShutdownRequest(immediate=False)) + + connector = DragonConnector() + + for request in requests: + response = connector.send_request(request) + if response.error_message is not None: + logger.error(response.error_message) + + logger.info("Terminated sending requests, waiting for Dragon Server to complete") + + # pylint: disable-next=protected-access + if connector._dragon_head_pid is None: + logger.error( + "Could not get Dragon Server PID and will not be able to monitor it." 
+ ) + return 1 + + while True: + # pylint: disable-next=protected-access + if psutil.pid_exists(connector._dragon_head_pid): + time.sleep(1) + else: + break + + return 0 + + +if __name__ == "__main__": + os.environ["PYTHONUNBUFFERED"] = "1" + logger.info("Dragon server started") + + parser = argparse.ArgumentParser( + prefix_chars="+", + description="SmartSim Dragon Client Process, to be used in batch scripts", + ) + parser.add_argument("+submit", type=str, help="Path to request file", required=True) + args_ = parser.parse_args() + + sys.exit(main(args_)) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 63e0ba018..ba1edd4b6 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -26,6 +26,7 @@ import collections import functools +import time import typing as t from dataclasses import dataclass, field from threading import RLock @@ -123,6 +124,7 @@ def __init__(self, pid: int) -> None: num_hosts = len(self._hosts) host_string = str(num_hosts) + (" hosts" if num_hosts > 1 else " host") self._shutdown_requested = False + self._can_shutdown = False self._updates = 0 print(f"{host_string} available for execution: {self._hosts}") @@ -138,7 +140,7 @@ def print_status(self) -> None: @property def should_shutdown(self) -> bool: - return self._shutdown_requested + return self._shutdown_requested and self._can_shutdown def _initialize_hosts(self) -> None: with self._hostlist_lock: @@ -349,11 +351,21 @@ def _refresh_statuses(self) -> None: self._allocated_hosts.pop(host) self._free_hosts.append(host) - def update(self) -> None: - self._updates += 1 + def _update_shutdown_status(self) -> None: + self._can_shutdown = all( + grp_info.status in TERMINAL_STATUSES + for grp_info in self._group_infos.values() + ) - self._start_steps() - self._refresh_statuses() + def update(self) -> None: + while True: + self._updates += 1 + self._start_steps() + 
self._refresh_statuses() + self._update_shutdown_status() + time.sleep(0.1) + if (self._updates % 100) == 0: + self.print_status() @process_request.register def _(self, request: DragonUpdateStatusRequest) -> DragonUpdateStatusResponse: @@ -395,4 +407,6 @@ def _(self, request: DragonHandshakeRequest) -> DragonHandshakeResponse: # pylint: disable-next=no-self-use,unused-argument def _(self, request: DragonShutdownRequest) -> DragonShutdownResponse: self._shutdown_requested = True + self._update_shutdown_status() + self._can_shutdown |= request.immediate return DragonShutdownResponse() diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 867cff055..d524d841f 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -39,6 +39,7 @@ from pathlib import Path from threading import RLock +import psutil import zmq from smartsim._core.launcher.dragon import dragonSockets @@ -86,10 +87,8 @@ def __init__(self) -> None: # Returned by dragon head, useful if shutdown is to be requested # but process was started by another connector self._dragon_head_pid: t.Optional[int] = None - self._dragon_server_path = os.getenv( - "SMARTSIM_DRAGON_SERVER_PATH_EXP", - os.getenv("SMARTSIM_DRAGON_SERVER_PATH", None), - ) + self._dragon_server_path = CONFIG.dragon_server_path + logger.debug(f"Dragon Server path was set to {self._dragon_server_path}") if self._dragon_server_path is None: raise SmartSimError( "DragonConnector could not find the dragon server path. 
" @@ -268,6 +267,7 @@ def cleanup(self) -> None: server_socket=self._dragon_head_socket, server_process_pid=self._dragon_head_pid, ) + time.sleep(1) def send_request(self, request: DragonRequest, flags: int = 0) -> DragonResponse: self.ensure_connected() @@ -314,38 +314,10 @@ def _send_req_with_socket( client = dragonSockets.as_client(socket) with DRG_LOCK: logger.debug(f"Sending {type(request).__name__}: {request}") - send_trials = 5 - while send_trials: - try: - client.send(request, flags) - break - except zmq.Again as e: - send_trials -= 1 - logger.debug( - "Could not send request to Dragon server in " - f"{int(client.socket.getsockopt(zmq.SNDTIMEO))/1000} seconds" - ) - if send_trials < 1: - raise e + client.send(request, flags) time.sleep(0.1) - receive_trials = 5 - response = None - while receive_trials: - try: - response = client.recv() - break - except zmq.Again as e: - receive_trials -= 1 - logger.debug( - "Did not receive response from Dragon server in " - f"{int(client.socket.getsockopt(zmq.RCVTIMEO))/1000} seconds" - ) - if receive_trials < 1: - raise e - - if response is None: - raise SmartSimError("Could not receive response from Dragon server") + response = client.recv() logger.debug(f"Received {type(response).__name__}: {response}") return response @@ -358,6 +330,8 @@ def _assert_schema_type(obj: object, typ: t.Type[_SchemaT], /) -> _SchemaT: def _dragon_cleanup(server_socket: zmq.Socket[t.Any], server_process_pid: int) -> None: + if not psutil.pid_exists(server_process_pid): + return try: # pylint: disable-next=protected-access DragonConnector._send_req_with_socket(server_socket, DragonShutdownRequest()) @@ -366,13 +340,15 @@ def _dragon_cleanup(server_socket: zmq.Socket[t.Any], server_process_pid: int) - print("Could not send shutdown request to dragon server") print(f"ZMQ error: {e}", flush=True) finally: - time.sleep(1) + time.sleep(5) try: - os.kill(server_process_pid, signal.SIGINT) + os.kill(server_process_pid, signal.SIGKILL) print("Sent 
SIGINT to dragon server") except ProcessLookupError: # Can't use the logger as I/O file may be closed print("Dragon server is not running.", flush=True) + finally: + time.sleep(5) def _resolve_dragon_path(fallback: t.Union[str, "os.PathLike[str]"]) -> Path: diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py index 69802cd8e..d61f54384 100644 --- a/smartsim/_core/launcher/dragon/dragonLauncher.py +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -48,8 +48,9 @@ DragonUpdateStatusResponse, ) from ..launcher import WLMLauncher +from ..pbs.pbsLauncher import PBSLauncher from ..slurm.slurmLauncher import SlurmLauncher -from ..step import DragonBatchStep, DragonStep, LocalStep, Step +from ..step import DragonBatchStep, DragonStep, Step from ..stepInfo import StepInfo from .dragonConnector import DragonConnector, _SchemaT @@ -71,6 +72,7 @@ def __init__(self) -> None: super().__init__() self._connector = DragonConnector() self._slurm_launcher = SlurmLauncher() + self._pbs_launcher = PBSLauncher() @property def is_connected(self) -> bool: @@ -87,7 +89,7 @@ def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: DragonRunSettings: DragonStep, SbatchSettings: DragonBatchStep, QsubBatchSettings: DragonBatchStep, - RunSettings: LocalStep, + RunSettings: DragonStep, } def run(self, step: Step) -> t.Optional[str]: @@ -109,22 +111,32 @@ def run(self, step: Step) -> t.Optional[str]: cmd = step.get_launch_cmd() out, err = step.get_output_files() - if isinstance(step, DragonBatchStep) and isinstance( - step.batch_settings, SbatchSettings - ): - # wait for batch step to submit successfully - logger.warning(f"{cmd}, {step.cwd}") - return_code, out, err = self.task_manager.start_and_wait(cmd, step.cwd) - if return_code != 0: - raise LauncherError(f"Sbatch submission failed\n {out}\n {err}") - if out: - slurm_step_id = out.strip() - logger.debug(f"Gleaned batch job id: {step_id} for {step.name}") - - 
self._slurm_launcher.step_mapping.add( - step.name, slurm_step_id, task_id, step.managed - ) - step_id = "SLURM-" + slurm_step_id + if isinstance(step, DragonBatchStep): + if isinstance(step.batch_settings, SbatchSettings): + # wait for batch step to submit successfully + return_code, out, err = self.task_manager.start_and_wait(cmd, step.cwd) + if return_code != 0: + raise LauncherError(f"Sbatch submission failed\n {out}\n {err}") + if out: + slurm_step_id = out.strip() + logger.debug(f"Gleaned batch job id: {step_id} for {step.name}") + + self._slurm_launcher.step_mapping.add( + step.name, slurm_step_id, task_id, step.managed + ) + step_id = "SLURM-" + slurm_step_id + elif isinstance(step.batch_settings, QsubBatchSettings): + # wait for batch step to submit successfully + return_code, out, err = self.task_manager.start_and_wait(cmd, step.cwd) + if return_code != 0: + raise LauncherError(f"Qsub batch submission failed\n {out}\n {err}") + if out: + pbs_step_id = out.strip() + logger.debug(f"Gleaned batch job id: {step_id} for {step.name}") + self._pbs_launcher.step_mapping.add( + step.name, pbs_step_id, task_id, step.managed + ) + step_id = "PBS-" + pbs_step_id elif isinstance(step, DragonStep): run_args = step.run_settings.run_args env = step.run_settings.env_vars @@ -156,6 +168,7 @@ def run(self, step: Step) -> t.Optional[str]: task_id = self.task_manager.start_task( cmd, step.cwd, step.env, out=out_strm.fileno(), err=err_strm.fileno() ) + step.managed = False self.step_mapping.add(step.name, step_id, task_id, step.managed) @@ -174,7 +187,12 @@ def stop(self, step_name: str) -> StepInfo: step_id = str(stepmap.step_id) if step_id.startswith("SLURM-"): - return self._slurm_launcher.stop(step_name.split("-", maxsplit=1)[1]) + return self._slurm_launcher.stop( + DragonLauncher._unprefix_step_id(step_name) + ) + + if step_id.startswith("PBS-"): + return self._pbs_launcher.stop(DragonLauncher._unprefix_step_id(step_name)) _assert_schema_type( 
self._connector.send_request(DragonStopRequest(step_id=step_id)), @@ -190,6 +208,10 @@ def stop(self, step_name: str) -> StepInfo: ) return step_info + @staticmethod + def _unprefix_step_id(step_id: str) -> str: + return step_id.split("-", maxsplit=1)[1] + def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: """Get step updates for Dragon-managed jobs @@ -201,22 +223,34 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: step_id_updates: dict[str, StepInfo] = {} - dragon_step_ids = [] - slurm_step_ids = [] + dragon_step_ids: t.List[str] = [] + slurm_step_ids: t.List[str] = [] + pbs_step_ids: t.List[str] = [] for step_id in step_ids: if step_id.startswith("SLURM-"): - print(step_id.split("-", maxsplit=1)[1]) slurm_step_ids.append(step_id) + elif step_id.startswith("PBS-"): + pbs_step_ids.append(step_id) else: dragon_step_ids.append(step_id) if slurm_step_ids: # pylint: disable-next=protected-access slurm_updates = self._slurm_launcher._get_managed_step_update( - [step_id.split("-", maxsplit=1)[1] for step_id in slurm_step_ids] + [ + DragonLauncher._unprefix_step_id(step_id) + for step_id in slurm_step_ids + ] ) step_id_updates.update(dict(zip(slurm_step_ids, slurm_updates))) + if pbs_step_ids: + # pylint: disable-next=protected-access + pbs_updates = self._pbs_launcher._get_managed_step_update( + [DragonLauncher._unprefix_step_id(step_id) for step_id in pbs_step_ids] + ) + step_id_updates.update(dict(zip(pbs_step_ids, pbs_updates))) + if dragon_step_ids: response = _assert_schema_type( self._connector.send_request( diff --git a/smartsim/_core/launcher/step/dragonStep.py b/smartsim/_core/launcher/step/dragonStep.py index 90d8d752f..587e52f20 100644 --- a/smartsim/_core/launcher/step/dragonStep.py +++ b/smartsim/_core/launcher/step/dragonStep.py @@ -157,7 +157,7 @@ def _dragon_entrypoint_cmd(request_file: str) -> str: f"{sys.executable}", "-m", "smartsim._core.entrypoints.dragon_client", - "--submit", + 
"+submit", f"{request_file}", "\n", ] @@ -215,6 +215,10 @@ def _write_sbatch_script(self) -> str: for opt in self.batch_settings.format_batch_args(): script_file.write(f"#SBATCH {opt}\n") + script_file.write( + f"#SBATCH --export=ALL,SMARTSIM_DRAGON_SERVER_PATH={self.cwd}\n" + ) + for cmd in self.batch_settings.preamble: script_file.write(f"{cmd}\n") @@ -241,6 +245,8 @@ def _write_qsub_script(self) -> str: for opt in self.batch_settings.format_batch_args(): script_file.write(f"#PBS {opt}\n") + script_file.write(f"#PBS -v SMARTSIM_DRAGON_SERVER_PATH={self.cwd}\n") + for cmd in self.batch_settings.preamble: script_file.write(f"{cmd}\n") diff --git a/smartsim/_core/schemas/dragonRequests.py b/smartsim/_core/schemas/dragonRequests.py index 58d4936e4..e134015b4 100644 --- a/smartsim/_core/schemas/dragonRequests.py +++ b/smartsim/_core/schemas/dragonRequests.py @@ -82,4 +82,8 @@ class DragonBootstrapRequest(DragonRequest): @request_registry.register("shutdown") -class DragonShutdownRequest(DragonRequest): ... +class DragonShutdownRequest(DragonRequest): + # Whether the server should shut down immediately + # setting this to False means that the server will + # shut down when all jobs are terminated. 
+ immediate: bool = True diff --git a/smartsim/experiment.py b/smartsim/experiment.py index e79b252f6..25dce2867 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -179,7 +179,9 @@ def __init__( def _set_dragon_server_path(self) -> None: """Set path for dragon server through environment varialbes""" if not "SMARTSIM_DRAGON_SERVER_PATH" in environ: - environ["SMARTSIM_DRAGON_SERVER_PATH_EXP"] = self.exp_path + environ["SMARTSIM_DRAGON_SERVER_PATH_EXP"] = osp.join( + self.exp_path, CONFIG.dragon_default_subdir + ) @_contextualize def start( diff --git a/tests/full_wlm/test_generic_batch_launch.py b/tests/full_wlm/test_generic_batch_launch.py index ac9a8651f..dc9878d6c 100644 --- a/tests/full_wlm/test_generic_batch_launch.py +++ b/tests/full_wlm/test_generic_batch_launch.py @@ -55,7 +55,7 @@ def test_batch_model(fileutils, test_dir, wlmutils): exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") - batch_settings = exp.create_batch_settings(nodes=1, time="00:01:00") + batch_settings = exp.create_batch_settings(nodes=1, time="00:05:00") batch_settings.set_account(wlmutils.get_test_account()) add_batch_resources(wlmutils, batch_settings) diff --git a/tests/on_wlm/test_dragon.py b/tests/on_wlm/test_dragon.py index d9d80bc72..d5f94c639 100644 --- a/tests/on_wlm/test_dragon.py +++ b/tests/on_wlm/test_dragon.py @@ -48,6 +48,8 @@ def test_dragon_global_path(global_dragon_teardown, wlmutils, test_dir, monkeypa exp.start(model, block=True) assert exp.get_status(model)[0] == SmartSimStatus.STATUS_COMPLETED + launcher: DragonLauncher = exp._control._launcher + launcher.cleanup() def test_dragon_exp_path(global_dragon_teardown, wlmutils, test_dir, monkeypatch): @@ -62,17 +64,15 @@ def test_dragon_exp_path(global_dragon_teardown, wlmutils, test_dir, monkeypatch exp.generate(model) exp.start(model, block=True) + assert exp.get_status(model)[0] == SmartSimStatus.STATUS_COMPLETED 
launcher: DragonLauncher = exp._control._launcher launcher.cleanup() - assert exp.get_status(model)[0] == SmartSimStatus.STATUS_COMPLETED - -def test_dragon_cannot_honor(wlmutils, test_dir, monkeypatch): - monkeypatch.setenv("SMARTSIM_DRAGON_SERVER_PATH", test_dir) +def test_dragon_cannot_honor(wlmutils, test_dir): exp: Experiment = Experiment( - "test_dragon_cannott_honor", + "test_dragon_cannot_honor", exp_path=test_dir, launcher=wlmutils.get_test_launcher(), ) @@ -84,3 +84,5 @@ def test_dragon_cannot_honor(wlmutils, test_dir, monkeypatch): exp.start(model, block=True) assert exp.get_status(model)[0] == SmartSimStatus.STATUS_FAILED + launcher: DragonLauncher = exp._control._launcher + launcher.cleanup() diff --git a/tests/test_controller_errors.py b/tests/test_controller_errors.py index db34bf7d1..60f35882d 100644 --- a/tests/test_controller_errors.py +++ b/tests/test_controller_errors.py @@ -164,9 +164,8 @@ def test_restarting_entity(test_dir, wlmutils, entity): step = MockStep("mock-step", test_dir, step_settings) test_launcher = wlmutils.get_test_launcher() controller = Controller(test_launcher) - if test_launcher == "dragon": - step = DragonStep("mock-step", test_dir, step_settings) - controller._launcher.connect_to_dragon(test_dir) + # if test_launcher == "dragon": + # step = DragonStep("mock-step", test_dir, step_settings) controller._jobs.add_job(entity.name, job_id="1234", entity=entity) controller._jobs.move_to_completed(controller._jobs.jobs.get(entity.name)) controller._launch_step(step, entity=entity) @@ -178,9 +177,8 @@ def test_restarting_orch(test_dir, wlmutils): step = MockStep("mock-step", test_dir, step_settings) test_launcher = wlmutils.get_test_launcher() controller = Controller(test_launcher) - if test_launcher == "dragon": - step = DragonStep("mock-step", test_dir, step_settings) - controller._launcher.connect_to_dragon(test_dir) + # if test_launcher == "dragon": + # step = DragonStep("mock-step", test_dir, step_settings) 
controller._jobs.add_job(orc.name, job_id="1234", entity=orc) controller._jobs.move_to_completed(controller._jobs.db_jobs.get(orc.name)) controller._launch_step(step, entity=orc) @@ -198,11 +196,11 @@ def test_starting_entity(test_dir, wlmutils, entity, entity_2): step_settings = RunSettings("echo") step = MockStep("mock-step", test_dir, step_settings) test_launcher = wlmutils.get_test_launcher() - if test_launcher == "dragon": - step = DragonStep("mock-step", test_dir, step_settings) + # if test_launcher == "dragon": + # step = DragonStep("mock-step", test_dir, step_settings) controller = Controller(test_launcher) controller._jobs.add_job(entity.name, job_id="1234", entity=entity) controller._jobs.move_to_completed(controller._jobs.jobs.get(entity.name)) with pytest.raises(SSUnsupportedError) as ex: controller._launch_step(step, entity=entity_2) - assert ex.value.args[0] == "SmartSim entities cannot have duplicate names." + assert ex.value.args[0] == "SmartSim entities cannot have duplicate names." 
diff --git a/tests/test_dragon_launcher.py b/tests/test_dragon_launcher.py index 6c29b9fe9..342908423 100644 --- a/tests/test_dragon_launcher.py +++ b/tests/test_dragon_launcher.py @@ -85,11 +85,11 @@ def test_dragon_connect_bind_address(monkeypatch: pytest.MonkeyPatch, test_dir: with monkeypatch.context() as ctx: ctx.setenv("SMARTSIM_DRAGON_SERVER_PATH", test_dir) ctx.setattr( - "smartsim._core.launcher.dragon.dragonLauncher.get_best_interface_and_address", + "smartsim._core.launcher.dragon.dragonConnector.get_best_interface_and_address", lambda: IFConfig(interface="faux_interface", address="127.0.0.1"), ) ctx.setattr( - "smartsim._core.launcher.dragon.dragonLauncher.DragonLauncher._handshake", + "smartsim._core.launcher.dragon.dragonLauncher.DragonConnector._handshake", lambda self, address: ..., ) @@ -99,9 +99,7 @@ def test_dragon_connect_bind_address(monkeypatch: pytest.MonkeyPatch, test_dir: ctx.setattr("subprocess.Popen", lambda *args, **kwargs: MockPopen()) dragon_launcher = DragonLauncher() - with pytest.raises(LauncherError) as ex: - # it will complain about failure to connect when validating... 
- dragon_launcher.connect_to_dragon(test_dir) + dragon_launcher._connector.connect_to_dragon() chosen_port = int(mock_socket.bind_address.split(":")[-1]) assert chosen_port >= 5995 From 8cee4c544246c95d966dad02ab44ba6f3266ffaf Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 8 Apr 2024 15:02:47 -0500 Subject: [PATCH 005/101] Fix batch launch --- smartsim/_core/entrypoints/dragon.py | 10 +++++++++- smartsim/_core/entrypoints/dragon_client.py | 12 ++++++++---- smartsim/_core/launcher/dragon/dragonBackend.py | 6 ++++++ smartsim/_core/launcher/dragon/dragonConnector.py | 2 +- smartsim/_core/launcher/dragon/dragonLauncher.py | 6 ++---- smartsim/_core/schemas/dragonRequests.py | 3 +++ 6 files changed, 29 insertions(+), 10 deletions(-) diff --git a/smartsim/_core/entrypoints/dragon.py b/smartsim/_core/entrypoints/dragon.py index 3c3c6507e..ba30cfe3a 100644 --- a/smartsim/_core/entrypoints/dragon.py +++ b/smartsim/_core/entrypoints/dragon.py @@ -31,6 +31,7 @@ import socket import sys import textwrap +import time import typing as t from types import FrameType @@ -127,7 +128,7 @@ def run( if not (dragon_backend.should_shutdown or SHUTDOWN_INITIATED): logger.debug(f"Listening to {dragon_head_address}") else: - logger.info("Shutdown has been requested") + logger.info("Backend shutdown has been requested") break try: @@ -135,6 +136,12 @@ def run( except Exception: logger.debug("Could not delete backend updater thread") + if not dragon_backend.frontend_shutdown: + logger.info("Frontend will have to be shut down externally") + while True: + time.sleep(1) + logger.info("Waiting for external shutdown") + def main(args: argparse.Namespace, zmq_context: zmq.Context[t.Any]) -> int: if_config = get_best_interface_and_address() @@ -174,6 +181,7 @@ def main(args: argparse.Namespace, zmq_context: zmq.Context[t.Any]) -> int: return os.EX_SOFTWARE logger.info("Shutting down! 
Bye bye!") + return 0 diff --git a/smartsim/_core/entrypoints/dragon_client.py b/smartsim/_core/entrypoints/dragon_client.py index 14b1495d9..7b4edb6c4 100644 --- a/smartsim/_core/entrypoints/dragon_client.py +++ b/smartsim/_core/entrypoints/dragon_client.py @@ -32,10 +32,11 @@ import time import typing as t -import psutil +import zmq from smartsim._core.launcher.dragon.dragonConnector import DragonConnector from smartsim._core.schemas import ( + DragonHandshakeRequest, DragonRequest, DragonShutdownRequest, request_registry, @@ -61,7 +62,7 @@ def main(args: argparse.Namespace) -> int: for req_str in req_strings: requests.append(request_registry.from_string(req_str)) - requests.append(DragonShutdownRequest(immediate=False)) + requests.append(DragonShutdownRequest(immediate=False, frontend_shutdown=False)) connector = DragonConnector() @@ -81,11 +82,14 @@ def main(args: argparse.Namespace) -> int: while True: # pylint: disable-next=protected-access - if psutil.pid_exists(connector._dragon_head_pid): + try: time.sleep(1) - else: + connector.send_request(DragonHandshakeRequest()) + except zmq.error.Again: + print("Could not reach server, assuming backend has shut down") break + print("Server has finished.") return 0 diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index ba1edd4b6..0a0a0fedc 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -125,6 +125,7 @@ def __init__(self, pid: int) -> None: host_string = str(num_hosts) + (" hosts" if num_hosts > 1 else " host") self._shutdown_requested = False self._can_shutdown = False + self._frontend_shutdown = None self._updates = 0 print(f"{host_string} available for execution: {self._hosts}") @@ -138,6 +139,10 @@ def print_status(self) -> None: print(f"| {self._updates}: There are {len(self._queued_steps)} queued steps") print("-------------------------------------------------------------\n") + 
@property + def frontend_shutdown(self) -> bool: + return bool(self._frontend_shutdown) + @property def should_shutdown(self) -> bool: return self._shutdown_requested and self._can_shutdown @@ -409,4 +414,5 @@ def _(self, request: DragonShutdownRequest) -> DragonShutdownResponse: self._shutdown_requested = True self._update_shutdown_status() self._can_shutdown |= request.immediate + self._frontend_shutdown = request.frontend_shutdown return DragonShutdownResponse() diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index d524d841f..81e725c24 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -342,7 +342,7 @@ def _dragon_cleanup(server_socket: zmq.Socket[t.Any], server_process_pid: int) - finally: time.sleep(5) try: - os.kill(server_process_pid, signal.SIGKILL) + os.kill(server_process_pid, signal.SIGTERM) print("Sent SIGINT to dragon server") except ProcessLookupError: # Can't use the logger as I/O file may be closed diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py index d61f54384..7d6cd384f 100644 --- a/smartsim/_core/launcher/dragon/dragonLauncher.py +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -187,12 +187,10 @@ def stop(self, step_name: str) -> StepInfo: step_id = str(stepmap.step_id) if step_id.startswith("SLURM-"): - return self._slurm_launcher.stop( - DragonLauncher._unprefix_step_id(step_name) - ) + return self._slurm_launcher.stop(step_name) if step_id.startswith("PBS-"): - return self._pbs_launcher.stop(DragonLauncher._unprefix_step_id(step_name)) + return self._pbs_launcher.stop(step_name) _assert_schema_type( self._connector.send_request(DragonStopRequest(step_id=step_id)), diff --git a/smartsim/_core/schemas/dragonRequests.py b/smartsim/_core/schemas/dragonRequests.py index e134015b4..07f20bc64 100644 --- 
a/smartsim/_core/schemas/dragonRequests.py +++ b/smartsim/_core/schemas/dragonRequests.py @@ -87,3 +87,6 @@ class DragonShutdownRequest(DragonRequest): # setting this to False means that the server will # shut down when all jobs are terminated. immediate: bool = True + # Whether the frontend will have to shut down + # or wait for external termination + frontend_shutdown: bool = True From ec1395abb5eda556b74133318c9518d4bf43ee0b Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 8 Apr 2024 16:36:09 -0500 Subject: [PATCH 006/101] Revert rmtree and mypy --- conftest.py | 2 +- smartsim/_core/launcher/dragon/dragonBackend.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conftest.py b/conftest.py index be2cf2300..9ca894424 100644 --- a/conftest.py +++ b/conftest.py @@ -153,7 +153,7 @@ def pytest_sessionfinish( Called after whole test run finished, right before returning the exit status to the system. """ - if False and exitstatus == 0: + if exitstatus == 0: cleanup_attempts = 5 while cleanup_attempts > 0: try: diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 0a0a0fedc..fe24e1c0c 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -125,7 +125,7 @@ def __init__(self, pid: int) -> None: host_string = str(num_hosts) + (" hosts" if num_hosts > 1 else " host") self._shutdown_requested = False self._can_shutdown = False - self._frontend_shutdown = None + self._frontend_shutdown: t.Optional[bool] = None self._updates = 0 print(f"{host_string} available for execution: {self._hosts}") From 021dfc4bdd26b60c890a5d9c3f5333c09b3a2c64 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 9 Apr 2024 13:11:25 -0500 Subject: [PATCH 007/101] Fix test --- tests/test_telemetry_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_telemetry_monitor.py b/tests/test_telemetry_monitor.py index 
68cdcd0f7..e167a7a7a 100644 --- a/tests/test_telemetry_monitor.py +++ b/tests/test_telemetry_monitor.py @@ -1306,7 +1306,7 @@ def _faux_updates(_self: WLMLauncher, _names: t.List[str]) -> t.List[StepInfo]: ctx.setattr(SlurmLauncher, "get_step_update", get_faux_update(status_in)) mani_handler = ManifestEventHandler("xyz", logger) - mani_handler.set_launcher("slurm", test_dir) + mani_handler.set_launcher("slurm") # prep a fake job to request updates for job_entity = JobEntity() From 6804416880508f54357c38a41777d15295d0e1e0 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 10 Apr 2024 10:58:53 -0500 Subject: [PATCH 008/101] Fix reconnect_orchestrator --- smartsim/_core/config/config.py | 12 ++++++++++-- smartsim/_core/control/controller.py | 2 +- .../_core/launcher/dragon/dragonConnector.py | 4 ++++ .../_core/launcher/dragon/dragonLauncher.py | 19 +++++++++++++++++++ smartsim/_core/launcher/launcher.py | 8 ++++++++ smartsim/_core/launcher/local/local.py | 5 ++++- smartsim/_core/launcher/step/dragonStep.py | 2 +- .../full_wlm/test_generic_orc_launch_batch.py | 2 +- 8 files changed, 48 insertions(+), 6 deletions(-) diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index f3b1df72a..74e0b4635 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -147,11 +147,11 @@ def database_cli(self) -> str: @property def database_file_parse_trials(self) -> int: - return int(os.getenv("SMARTSIM_DB_FILE_PARSE_TRIALS", "10")) + return int(os.getenv("SMARTSIM_DB_FILE_PARSE_TRIALS", "100")) @property def database_file_parse_interval(self) -> int: - return int(os.getenv("SMARTSIM_DB_FILE_PARSE_INTERVAL", "2")) + return int(os.getenv("SMARTSIM_DB_FILE_PARSE_INTERVAL", "5")) @property def dragon_server_path(self) -> t.Optional[str]: @@ -172,6 +172,14 @@ def dragon_server_timeout(self) -> int: def dragon_server_startup_timeout(self) -> int: return int(os.getenv("SMARTSIM_DRAGON_STARTUP_TIMEOUT", "-1")) + @property + def 
dragon_transport(self) -> str: + return os.getenv("SMARTSIM_DRAGON_TRANSPORT", "hsta") + + @property + def dragon_log_level(self) -> str: + return os.getenv("SMARTSIM_DRAGON_LOG_LEVEL", "INFO") + @property def log_level(self) -> str: return os.environ.get("SMARTSIM_LOG_LEVEL", "info") diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index aabbad71f..8c11eedcc 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -806,7 +806,7 @@ def reload_saved_db(self, checkpoint_file: str) -> Orchestrator: try: for db_job, step in job_steps: self._jobs.db_jobs[db_job.ename] = db_job - self._launcher.step_mapping[db_job.name] = step + self._launcher.add_step_to_mapping_table(db_job.name, step) if step.task_id: self._launcher.task_manager.add_existing(int(step.task_id)) except LauncherError as e: diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 81e725c24..4f357f21b 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -170,6 +170,10 @@ def connect_to_dragon(self) -> None: cmd = [ "dragon", + "-t", + CONFIG.dragon_transport, + "-l", + CONFIG.dragon_log_level, sys.executable, "-m", "smartsim._core.entrypoints.dragon", diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py index 7d6cd384f..f8b0c17fa 100644 --- a/smartsim/_core/launcher/dragon/dragonLauncher.py +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -29,6 +29,7 @@ import os import typing as t +from ...._core.launcher.stepMapping import StepMap from ....error import LauncherError from ....log import get_logger from ....settings import ( @@ -92,6 +93,24 @@ def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: RunSettings: DragonStep, } + def add_step_to_mapping_table(self, name: str, step_map: StepMap): + 
super().add_step_to_mapping_table(name, step_map) + + if step_map.step_id.startswith("SLURM-"): + slurm_step_map = StepMap( + step_id=DragonLauncher._unprefix_step_id(step_map.step_id), + task_id=step_map.task_id, + managed=step_map.managed, + ) + self._slurm_launcher.add_step_to_mapping_table(name, slurm_step_map) + elif step_map.step_id.startswith("PBS-"): + pbs_step_map = StepMap( + step_id=DragonLauncher._unprefix_step_id(step_map.step_id), + task_id=step_map.task_id, + managed=step_map.managed, + ) + self._pbs_launcher.add_step_to_mapping_table(name, pbs_step_map) + def run(self, step: Step) -> t.Optional[str]: """Run a job step through Slurm diff --git a/smartsim/_core/launcher/launcher.py b/smartsim/_core/launcher/launcher.py index 80000c22f..ba79060d9 100644 --- a/smartsim/_core/launcher/launcher.py +++ b/smartsim/_core/launcher/launcher.py @@ -27,6 +27,7 @@ import abc import typing as t +from ..._core.launcher.stepMapping import StepMap from ...error import AllocationError, LauncherError, SSUnsupportedError from ...settings import SettingsBase from .step import Step @@ -69,6 +70,10 @@ def run(self, step: Step) -> t.Optional[str]: def stop(self, step_name: str) -> StepInfo: raise NotImplementedError + @abc.abstractmethod + def add_step_to_mapping_table(self, name: str, step_map: StepMap): + raise NotImplementedError + class WLMLauncher(Launcher): # cov-wlm """The base class for any Launcher that utilizes workload @@ -86,6 +91,9 @@ def __init__(self) -> None: def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: raise NotImplementedError + def add_step_to_mapping_table(self, name: str, step_map: StepMap): + self.step_mapping[name] = step_map + # every launcher utilizing this interface must have a map # of supported RunSettings types (see slurmLauncher.py for ex) def create_step( diff --git a/smartsim/_core/launcher/local/local.py b/smartsim/_core/launcher/local/local.py index 96778ec0d..036ecdf12 100644 --- 
a/smartsim/_core/launcher/local/local.py +++ b/smartsim/_core/launcher/local/local.py @@ -30,7 +30,7 @@ from ..launcher import Launcher from ..step import LocalStep, Step from ..stepInfo import StepInfo, UnmanagedStepInfo -from ..stepMapping import StepMapping +from ..stepMapping import StepMap, StepMapping from ..taskManager import TaskManager @@ -41,6 +41,9 @@ def __init__(self) -> None: self.task_manager = TaskManager() self.step_mapping = StepMapping() + def add_step_to_mapping_table(self, name: str, step_map: StepMap): + self.step_mapping[name] = step_map + def create_step(self, name: str, cwd: str, step_settings: SettingsBase) -> Step: """Create a job step to launch an entity locally diff --git a/smartsim/_core/launcher/step/dragonStep.py b/smartsim/_core/launcher/step/dragonStep.py index 587e52f20..b26765ff1 100644 --- a/smartsim/_core/launcher/step/dragonStep.py +++ b/smartsim/_core/launcher/step/dragonStep.py @@ -216,7 +216,7 @@ def _write_sbatch_script(self) -> str: script_file.write(f"#SBATCH {opt}\n") script_file.write( - f"#SBATCH --export=ALL,SMARTSIM_DRAGON_SERVER_PATH={self.cwd}\n" + f"#SBATCH --export=ALL,SMARTSIM_DRAGON_SERVER_PATH={self.cwd},PYTHONUNBUFFERED=1\n" ) for cmd in self.batch_settings.preamble: diff --git a/tests/full_wlm/test_generic_orc_launch_batch.py b/tests/full_wlm/test_generic_orc_launch_batch.py index 293a2cdd2..d097aa439 100644 --- a/tests/full_wlm/test_generic_orc_launch_batch.py +++ b/tests/full_wlm/test_generic_orc_launch_batch.py @@ -163,7 +163,7 @@ def test_launch_cluster_orc_reconnect(test_dir, wlmutils): orc.batch_settings.set_account(wlmutils.get_test_account()) - orc.batch_settings.set_walltime("00:03:00") + orc.batch_settings.set_walltime("00:01:00") exp.start(orc, block=True) From e3f006c72657cea7090bbead659a99d791a301da Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 15 Apr 2024 13:21:06 -0500 Subject: [PATCH 009/101] Set NONE for default dragon log level --- smartsim/_core/config/config.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 74e0b4635..41575db22 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -178,7 +178,7 @@ def dragon_transport(self) -> str: @property def dragon_log_level(self) -> str: - return os.getenv("SMARTSIM_DRAGON_LOG_LEVEL", "INFO") + return os.getenv("SMARTSIM_DRAGON_LOG_LEVEL", "NONE") @property def log_level(self) -> str: From d47f65a1d478026066c53351dfe55c2ee683ca40 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 15 Apr 2024 14:56:46 -0500 Subject: [PATCH 010/101] Revert config changes --- smartsim/_core/config/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 41575db22..0bc3afff9 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -147,11 +147,11 @@ def database_cli(self) -> str: @property def database_file_parse_trials(self) -> int: - return int(os.getenv("SMARTSIM_DB_FILE_PARSE_TRIALS", "100")) + return int(os.getenv("SMARTSIM_DB_FILE_PARSE_TRIALS", "10")) @property def database_file_parse_interval(self) -> int: - return int(os.getenv("SMARTSIM_DB_FILE_PARSE_INTERVAL", "5")) + return int(os.getenv("SMARTSIM_DB_FILE_PARSE_INTERVAL", "2")) @property def dragon_server_path(self) -> t.Optional[str]: From 782858994accbbf4f70347d07ad5dedca2eb4893 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 16 Apr 2024 05:46:55 -0500 Subject: [PATCH 011/101] Fix tests, enforce SIGTERM to dragon --- smartsim/_core/entrypoints/dragon.py | 2 +- smartsim/_core/launcher/dragon/dragonBackend.py | 6 +++--- smartsim/_core/launcher/dragon/dragonConnector.py | 9 +++++---- smartsim/_core/launcher/dragon/dragonLauncher.py | 3 ++- tests/on_wlm/test_simple_entity_launch.py | 9 +++++++-- 5 files changed, 18 insertions(+), 11 deletions(-) diff --git a/smartsim/_core/entrypoints/dragon.py 
b/smartsim/_core/entrypoints/dragon.py index ba30cfe3a..883f3cc3d 100644 --- a/smartsim/_core/entrypoints/dragon.py +++ b/smartsim/_core/entrypoints/dragon.py @@ -99,7 +99,7 @@ def run( dragon_backend = DragonBackend(pid=dragon_pid) backend_updater = ContextThread( - name="JobManager", daemon=True, target=dragon_backend.update + name="DragonBackend", daemon=True, target=dragon_backend.update ) backend_updater.start() diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index fe24e1c0c..0ed59f849 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -362,14 +362,14 @@ def _update_shutdown_status(self) -> None: for grp_info in self._group_infos.values() ) - def update(self) -> None: + def update(self, interval: float=0.01) -> None: while True: self._updates += 1 self._start_steps() self._refresh_statuses() self._update_shutdown_status() - time.sleep(0.1) - if (self._updates % 100) == 0: + time.sleep(0.01) + if (self._updates % int(10/interval)) == 0: self.print_status() @process_request.register diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 4f357f21b..1ad0f714e 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -271,7 +271,6 @@ def cleanup(self) -> None: server_socket=self._dragon_head_socket, server_process_pid=self._dragon_head_pid, ) - time.sleep(1) def send_request(self, request: DragonRequest, flags: int = 0) -> DragonResponse: self.ensure_connected() @@ -346,13 +345,15 @@ def _dragon_cleanup(server_socket: zmq.Socket[t.Any], server_process_pid: int) - finally: time.sleep(5) try: - os.kill(server_process_pid, signal.SIGTERM) + os.kill(server_process_pid, signal.SIGINT) print("Sent SIGINT to dragon server") + time.sleep(5) + if psutil.pid_exists(server_process_pid): + os.kill(server_process_pid, 
signal.SIGTERM) except ProcessLookupError: # Can't use the logger as I/O file may be closed print("Dragon server is not running.", flush=True) - finally: - time.sleep(5) + def _resolve_dragon_path(fallback: t.Union[str, "os.PathLike[str]"]) -> Path: diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py index f8b0c17fa..de10e7e37 100644 --- a/smartsim/_core/launcher/dragon/dragonLauncher.py +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -223,6 +223,7 @@ def stop(self, step_name: str) -> StepInfo: step_info.status = ( SmartSimStatus.STATUS_CANCELLED # set status to cancelled instead of failed ) + step_info.launcher_status = str(SmartSimStatus.STATUS_CANCELLED) return step_info @staticmethod @@ -294,7 +295,7 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: grp_ret_code = min(ret_codes) if any(ret_codes): _err_msg = ( - f"One or more processes failed for job {step_id}" + f"One or more processes failed for job {step_id} " f"Return codes were: {ret_codes}" ) logger.error(_err_msg) diff --git a/tests/on_wlm/test_simple_entity_launch.py b/tests/on_wlm/test_simple_entity_launch.py index 6632b6816..09fca3dd9 100644 --- a/tests/on_wlm/test_simple_entity_launch.py +++ b/tests/on_wlm/test_simple_entity_launch.py @@ -82,7 +82,7 @@ def test_multinode_app(mpi_app_path, test_dir, wlmutils): exp.start(model, block=True) p = Path(model.path) - output_files = sorted([str(path) for path in p.glob("*")]) + output_files = sorted([str(path) for path in p.glob("mpi_hello*")]) expected_files = sorted( [os.path.join(model.path, f"mpi_hello.{idx}.log") for idx in range(3)] ) @@ -140,13 +140,18 @@ def test_summary(fileutils, test_dir, wlmutils): rows = [s.split() for s in summary_str.split("\n")] headers = ["Index"] + rows.pop(0) + # There is no guarantee that the order of + # the rows will be sleep, bad row = dict(zip(headers, rows[0])) + row_1 = dict(zip(headers, rows[1])) + if row["Name"] 
!= sleep.name: + row_1, row = row, row_1 + assert sleep.name == row["Name"] assert sleep.type == row["Entity-Type"] assert 0 == int(row["RunID"]) assert 0 == int(row["Returncode"]) - row_1 = dict(zip(headers, rows[1])) assert bad.name == row_1["Name"] assert bad.type == row_1["Entity-Type"] assert 0 == int(row_1["RunID"]) From 8b6534df81a124f9776e956a27e0d0affd6accab Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 16 Apr 2024 06:04:53 -0500 Subject: [PATCH 012/101] Post-merge fix to Connector --- .../_core/launcher/dragon/dragonConnector.py | 59 ++++++++++++++----- 1 file changed, 43 insertions(+), 16 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 1ad0f714e..2ed932469 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -87,6 +87,7 @@ def __init__(self) -> None: # Returned by dragon head, useful if shutdown is to be requested # but process was started by another connector self._dragon_head_pid: t.Optional[int] = None + self._authenticator: t.Optional[zmq.auth.thread.ThreadAuthenticator] = None self._dragon_server_path = CONFIG.dragon_server_path logger.debug(f"Dragon Server path was set to {self._dragon_server_path}") if self._dragon_server_path is None: @@ -103,7 +104,9 @@ def is_connected(self) -> bool: return self._dragon_head_socket is not None def _handshake(self, address: str) -> None: - self._dragon_head_socket = self._context.socket(zmq.REQ) + self._dragon_head_socket, self._authenticator = dragonSockets.get_secure_socket( + self._context, zmq.REQ, False, self._authenticator + ) self._dragon_head_socket.connect(address) try: dragon_handshake = _assert_schema_type( @@ -184,8 +187,10 @@ def connect_to_dragon(self) -> None: connector_socket: t.Optional[zmq.Socket[t.Any]] = None if address is not None: self._set_timeout(self._startup_timeout) - connector_socket = self._context.socket(zmq.REP) + 
connector_socket, self._authenticator = dragonSockets.get_secure_socket( + self._context, zmq.REP, True, self._authenticator + ) # find first available port >= 5995 port = find_free_port(start=5995) socket_addr = f"tcp://{address}:{port}" @@ -259,6 +264,7 @@ def log_dragon_outputs() -> None: _dragon_cleanup, server_socket=server_socket, server_process_pid=server_process_pid, + server_authenticator=self._authenticator, ) else: # TODO parse output file @@ -270,6 +276,7 @@ def cleanup(self) -> None: _dragon_cleanup( server_socket=self._dragon_head_socket, server_process_pid=self._dragon_head_pid, + server_authenticator=self._authenticator, ) def send_request(self, request: DragonRequest, flags: int = 0) -> DragonResponse: @@ -332,28 +339,48 @@ def _assert_schema_type(obj: object, typ: t.Type[_SchemaT], /) -> _SchemaT: return obj -def _dragon_cleanup(server_socket: zmq.Socket[t.Any], server_process_pid: int) -> None: - if not psutil.pid_exists(server_process_pid): - return +def _dragon_cleanup( + server_socket: t.Optional[zmq.Socket[t.Any]] = None, + server_process_pid: t.Optional[int] = 0, + server_authenticator: t.Optional[zmq.auth.thread.ThreadAuthenticator] = None, +) -> None: + """Clean up resources used by the launcher. 
+ :param server_socket: (optional) Socket used to connect to dragon environment + :type server_socket: Optional[zmq.Socket] + :param server_process_pid: (optional) Process ID of the dragon entrypoint + :type server_process_pid: Optional[int] + :param server_authenticator: (optional) Authenticator used to secure sockets + :type server_authenticator: Optional[zmq.auth.thread.ThreadAuthenticator] + """ try: - # pylint: disable-next=protected-access - DragonConnector._send_req_with_socket(server_socket, DragonShutdownRequest()) + if server_socket is not None: + DragonConnector.send_req_with_socket(server_socket, DragonShutdownRequest()) except zmq.error.ZMQError as e: # Can't use the logger as I/O file may be closed print("Could not send shutdown request to dragon server") print(f"ZMQ error: {e}", flush=True) finally: + time.sleep(1) + + if not psutil.pid_exists(server_process_pid) or not server_process_pid: + return + + try: + os.kill(server_process_pid, signal.SIGINT) + print("Sent SIGINT to dragon server") time.sleep(5) - try: - os.kill(server_process_pid, signal.SIGINT) - print("Sent SIGINT to dragon server") - time.sleep(5) - if psutil.pid_exists(server_process_pid): - os.kill(server_process_pid, signal.SIGTERM) - except ProcessLookupError: - # Can't use the logger as I/O file may be closed - print("Dragon server is not running.", flush=True) + if psutil.pid_exists(server_process_pid): + os.kill(server_process_pid, signal.SIGTERM) + except ProcessLookupError: + # Can't use the logger as I/O file may be closed + print("Dragon server is not running.", flush=True) + + try: + if server_authenticator is not None: + server_authenticator.stop() + except Exception: + print("Authenticator shutdown error") def _resolve_dragon_path(fallback: t.Union[str, "os.PathLike[str]"]) -> Path: From 64ded594c60ed127d10c63b1f41b38c5754728f6 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 16 Apr 2024 06:08:20 -0500 Subject: [PATCH 013/101] Another post-merge fix --- 
smartsim/_core/launcher/dragon/dragonLauncher.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py index ebf32f30b..de10e7e37 100644 --- a/smartsim/_core/launcher/dragon/dragonLauncher.py +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -75,8 +75,6 @@ def __init__(self) -> None: self._slurm_launcher = SlurmLauncher() self._pbs_launcher = PBSLauncher() - self._set_timeout(self._timeout) - @property def is_connected(self) -> bool: return self._connector.is_connected From 04b56ec0a4b150411f1647110d925e10584833d8 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 16 Apr 2024 06:13:44 -0500 Subject: [PATCH 014/101] Hotfix --- smartsim/_core/launcher/dragon/dragonConnector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 2ed932469..e9c1f3770 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -105,7 +105,7 @@ def is_connected(self) -> bool: def _handshake(self, address: str) -> None: self._dragon_head_socket, self._authenticator = dragonSockets.get_secure_socket( - self._context, zmq.REQ, False, self._authenticator + self._context, zmq.REQ, False ) self._dragon_head_socket.connect(address) try: @@ -189,7 +189,7 @@ def connect_to_dragon(self) -> None: self._set_timeout(self._startup_timeout) connector_socket, self._authenticator = dragonSockets.get_secure_socket( - self._context, zmq.REP, True, self._authenticator + self._context, zmq.REP, True ) # find first available port >= 5995 port = find_free_port(start=5995) From 97f22586e612e1640d305569dd193039ac8a5566 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 16 Apr 2024 13:00:34 -0500 Subject: [PATCH 015/101] Fix stop behavior for dragon server --- .../_core/launcher/dragon/dragonBackend.py | 126 
++++++++++++------ .../_core/launcher/dragon/dragonConnector.py | 9 +- .../_core/launcher/dragon/dragonSockets.py | 3 +- 3 files changed, 91 insertions(+), 47 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 14f384865..794bf4fd2 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -117,6 +117,7 @@ def __init__(self, pid: int) -> None: self._queued_steps: "collections.OrderedDict[str, DragonRunRequest]" = ( collections.OrderedDict() ) + self._stop_requests: t.Deque[str] = collections.deque() self._running_steps: t.List[str] = [] self._completed_steps: t.List[str] = [] @@ -219,6 +220,7 @@ def _start_redirect_workers( out_file: t.Optional[str], err_file: t.Optional[str], ) -> None: + print("INSIDE START REDIR") grp_redir = ProcessGroup(restart=False, policy=global_policy) for pol, puid in zip(policies, puids): proc = Process(None, ident=puid) @@ -242,8 +244,37 @@ def _start_redirect_workers( policy=pol, ), ) + print("INIT REDIR GRP") grp_redir.init() + print("START REDIR GRP") grp_redir.start() + print("EXIT REDIR GRP") + + def _stop_steps(self) -> None: + print(f"Steps to stop {self._stop_requests}") + while len(self._stop_requests) > 0: + request = self._stop_requests.popleft() + print(f"Stopping step {request.step_id}") + if request.step_id in self._queued_steps: + self._group_infos[request.step_id].status = SmartSimStatus.STATUS_CANCELLED + self._group_infos[request.step_id].return_codes = [-9] + self._queued_steps.pop(request.step_id) + elif request.step_id in self._group_infos: + # Technically we could just terminate, but what if + # the application intercepts that and ignores it? 
+ proc_group = self._group_infos[request.step_id].process_group + if proc_group is None: + self._group_infos[request.step_id].status = SmartSimStatus.STATUS_CANCELLED + self._group_infos[request.step_id].return_codes = [-9] + elif proc_group.status not in TERMINAL_STATUSES: + try: + proc_group.kill() + except DragonProcessGroupError: + try: + proc_group.stop() + except DragonProcessGroupError: + print("Process group already stopped") + def _start_steps(self) -> None: started = [] @@ -262,7 +293,7 @@ def _start_steps(self) -> None: ) policies = [] - for node_name in hosts[: request.nodes]: + for node_name in hosts: local_policy = Policy( placement=Policy.Placement.HOST_NAME, host_name=node_name ) @@ -278,20 +309,30 @@ def _start_steps(self) -> None: ) grp.add_process(nproc=request.tasks_per_node, template=tmp_proc) - grp.init() - grp.start() - puids = grp.puids - self._group_infos[step_id] = ProcessGroupInfo( - process_group=grp, - puids=puids, - return_codes=[], - status=SmartSimStatus.STATUS_RUNNING, - hosts=hosts, - ) - self._running_steps.append(step_id) - started.append(step_id) + try: + print("init") + grp.init() + print("start") + grp.start() + except Exception as e: + print(e) + + try: + puids = grp.puids + self._group_infos[step_id] = ProcessGroupInfo( + process_group=grp, + puids=puids, + return_codes=[], + status=SmartSimStatus.STATUS_RUNNING, + hosts=hosts, + ) + self._running_steps.append(step_id) + started.append(step_id) + except Exception as e: + print(e) try: + print("Redir") DragonBackend._start_redirect_workers( global_policy, policies, @@ -299,6 +340,7 @@ def _start_steps(self) -> None: request.output_file, request.error_file, ) + print("Redir'd") except Exception as e: raise IOError("Could not redirect output") from e @@ -306,7 +348,10 @@ def _start_steps(self) -> None: print(f"{self._updates}: {started=}") for step_id in started: - self._queued_steps.pop(step_id) + try: + self._queued_steps.pop(step_id) + except KeyError as e: + print(e) def 
_refresh_statuses(self) -> None: terminated = [] @@ -317,7 +362,7 @@ def _refresh_statuses(self) -> None: if grp is None: group_info.status = SmartSimStatus.STATUS_FAILED group_info.return_codes = [-1] - else: + elif grp.status not in TERMINAL_STATUSES: if grp.status == DRG_RUNNING_STATUS: group_info.status = SmartSimStatus.STATUS_RUNNING else: @@ -332,12 +377,13 @@ def _refresh_statuses(self) -> None: group_info.return_codes = [-1 for _ in puids] else: group_info.return_codes = [0] - group_info.status = ( - SmartSimStatus.STATUS_FAILED - if any(group_info.return_codes) - or grp.status == DRG_ERROR_STATUS - else SmartSimStatus.STATUS_COMPLETED - ) + if not group_info.status == SmartSimStatus.STATUS_CANCELLED: + group_info.status = ( + SmartSimStatus.STATUS_FAILED + if any(group_info.return_codes) + or grp.status == DRG_ERROR_STATUS + else SmartSimStatus.STATUS_COMPLETED + ) if group_info.status in TERMINAL_STATUSES: terminated.append(step_id) @@ -363,13 +409,21 @@ def _update_shutdown_status(self) -> None: def update(self, interval: float=0.01) -> None: while True: - self._updates += 1 - self._start_steps() - self._refresh_statuses() - self._update_shutdown_status() - time.sleep(0.01) - if (self._updates % int(10/interval)) == 0: - self.print_status() + try: + self._updates += 1 + print("STOP") + self._stop_steps() + print("START") + self._start_steps() + print("REFRESH") + self._refresh_statuses() + print("UPDATE SHUTDOWN") + self._update_shutdown_status() + except Exception as e: + print(e) + time.sleep(0.1) + # if (self._updates % int(10/interval)) == 0: + self.print_status() @process_request.register def _(self, request: DragonUpdateStatusRequest) -> DragonUpdateStatusResponse: @@ -383,21 +437,7 @@ def _(self, request: DragonUpdateStatusRequest) -> DragonUpdateStatusResponse: @process_request.register def _(self, request: DragonStopRequest) -> DragonStopResponse: - if request.step_id in self._group_infos: - # Technically we could just terminate, but what if - # 
the application intercepts that and ignores it? - proc_group = self._group_infos[request.step_id].process_group - if proc_group is None: - self._group_infos[request.step_id].status = SmartSimStatus.STATUS_FAILED - elif proc_group.status not in TERMINAL_STATUSES: - try: - proc_group.kill() - except DragonProcessGroupError: - try: - proc_group.stop() - except DragonProcessGroupError: - print("Process group already stopped") - + self._stop_requests.append(request) return DragonStopResponse() @process_request.register diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index e9c1f3770..164fd4058 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -41,6 +41,7 @@ import psutil import zmq +import zmq.auth.thread from smartsim._core.launcher.dragon import dragonSockets from smartsim.error.errors import SmartSimError @@ -104,7 +105,7 @@ def is_connected(self) -> bool: return self._dragon_head_socket is not None def _handshake(self, address: str) -> None: - self._dragon_head_socket, self._authenticator = dragonSockets.get_secure_socket( + self._dragon_head_socket = dragonSockets.get_secure_socket( self._context, zmq.REQ, False ) self._dragon_head_socket.connect(address) @@ -147,6 +148,8 @@ def connect_to_dragon(self) -> None: path = _resolve_dragon_path(self._dragon_server_path) dragon_config_log = path / CONFIG.dragon_log_filename + self._authenticator = dragonSockets.get_authenticator(self._context) + if dragon_config_log.is_file(): dragon_confs = self._parse_launched_dragon_server_info_from_files( [dragon_config_log] @@ -188,7 +191,7 @@ def connect_to_dragon(self) -> None: if address is not None: self._set_timeout(self._startup_timeout) - connector_socket, self._authenticator = dragonSockets.get_secure_socket( + connector_socket = dragonSockets.get_secure_socket( self._context, zmq.REP, True ) # find first available port >= 5995 @@ -354,7 
+357,7 @@ def _dragon_cleanup( """ try: if server_socket is not None: - DragonConnector.send_req_with_socket(server_socket, DragonShutdownRequest()) + DragonConnector._send_req_with_socket(server_socket, DragonShutdownRequest()) except zmq.error.ZMQError as e: # Can't use the logger as I/O file may be closed print("Could not send shutdown request to dragon server") diff --git a/smartsim/_core/launcher/dragon/dragonSockets.py b/smartsim/_core/launcher/dragon/dragonSockets.py index ca693428d..2d76f7f01 100644 --- a/smartsim/_core/launcher/dragon/dragonSockets.py +++ b/smartsim/_core/launcher/dragon/dragonSockets.py @@ -93,7 +93,7 @@ def get_secure_socket( # configure the server keys on the socket socket.curve_secretkey = server_keys.private - socket.curve_publickey = client_keys.public + socket.curve_publickey = server_keys.public socket.curve_server = True else: # configure client keys on the socket to encrypt outgoing messages @@ -114,6 +114,7 @@ def get_authenticator( :type context: zmq.Context :returns: the activated `Authenticator` :rtype: zmq.auth.thread.ThreadAuthenticator""" + config = get_config() key_manager = KeyManager(config, as_client=True) From fe3fae28ba092f7648b9e145354e49bfba58f634 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 16 Apr 2024 15:15:13 -0500 Subject: [PATCH 016/101] Fix stop/start race condition --- .../_core/launcher/dragon/dragonBackend.py | 45 +++++++------------ .../_core/launcher/dragon/dragonSockets.py | 2 +- 2 files changed, 18 insertions(+), 29 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 794bf4fd2..3c15846f3 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -220,7 +220,6 @@ def _start_redirect_workers( out_file: t.Optional[str], err_file: t.Optional[str], ) -> None: - print("INSIDE START REDIR") grp_redir = ProcessGroup(restart=False, policy=global_policy) for pol, puid 
in zip(policies, puids): proc = Process(None, ident=puid) @@ -244,29 +243,25 @@ def _start_redirect_workers( policy=pol, ), ) - print("INIT REDIR GRP") grp_redir.init() - print("START REDIR GRP") grp_redir.start() - print("EXIT REDIR GRP") def _stop_steps(self) -> None: - print(f"Steps to stop {self._stop_requests}") while len(self._stop_requests) > 0: request = self._stop_requests.popleft() - print(f"Stopping step {request.step_id}") + step_id = request.step_id + if step_id not in self._group_infos: + print(f"Requested to stop non-existing step {step_id}") + continue + + print(f"Stopping step {step_id}") if request.step_id in self._queued_steps: - self._group_infos[request.step_id].status = SmartSimStatus.STATUS_CANCELLED - self._group_infos[request.step_id].return_codes = [-9] - self._queued_steps.pop(request.step_id) - elif request.step_id in self._group_infos: + self._queued_steps.pop(step_id) + else: # Technically we could just terminate, but what if # the application intercepts that and ignores it? 
- proc_group = self._group_infos[request.step_id].process_group - if proc_group is None: - self._group_infos[request.step_id].status = SmartSimStatus.STATUS_CANCELLED - self._group_infos[request.step_id].return_codes = [-9] - elif proc_group.status not in TERMINAL_STATUSES: + proc_group = self._group_infos[step_id].process_group + if proc_group is None and proc_group.status not in TERMINAL_STATUSES: try: proc_group.kill() except DragonProcessGroupError: @@ -275,6 +270,9 @@ def _stop_steps(self) -> None: except DragonProcessGroupError: print("Process group already stopped") + self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED + self._group_infos[step_id].return_codes = [-9] + def _start_steps(self) -> None: started = [] @@ -310,9 +308,7 @@ def _start_steps(self) -> None: grp.add_process(nproc=request.tasks_per_node, template=tmp_proc) try: - print("init") grp.init() - print("start") grp.start() except Exception as e: print(e) @@ -332,7 +328,6 @@ def _start_steps(self) -> None: print(e) try: - print("Redir") DragonBackend._start_redirect_workers( global_policy, policies, @@ -340,7 +335,6 @@ def _start_steps(self) -> None: request.output_file, request.error_file, ) - print("Redir'd") except Exception as e: raise IOError("Could not redirect output") from e @@ -358,11 +352,10 @@ def _refresh_statuses(self) -> None: for step_id in self._running_steps: group_info = self._group_infos[step_id] grp = group_info.process_group - if grp is None: group_info.status = SmartSimStatus.STATUS_FAILED group_info.return_codes = [-1] - elif grp.status not in TERMINAL_STATUSES: + elif group_info.status not in TERMINAL_STATUSES: if grp.status == DRG_RUNNING_STATUS: group_info.status = SmartSimStatus.STATUS_RUNNING else: @@ -411,19 +404,15 @@ def update(self, interval: float=0.01) -> None: while True: try: self._updates += 1 - print("STOP") self._stop_steps() - print("START") self._start_steps() - print("REFRESH") self._refresh_statuses() - print("UPDATE SHUTDOWN") 
self._update_shutdown_status() + time.sleep(0.1) except Exception as e: print(e) - time.sleep(0.1) - # if (self._updates % int(10/interval)) == 0: - self.print_status() + if (self._updates % int(10/interval)) == 0: + self.print_status() @process_request.register def _(self, request: DragonUpdateStatusRequest) -> DragonUpdateStatusResponse: diff --git a/smartsim/_core/launcher/dragon/dragonSockets.py b/smartsim/_core/launcher/dragon/dragonSockets.py index 2d76f7f01..71de1e15d 100644 --- a/smartsim/_core/launcher/dragon/dragonSockets.py +++ b/smartsim/_core/launcher/dragon/dragonSockets.py @@ -114,7 +114,7 @@ def get_authenticator( :type context: zmq.Context :returns: the activated `Authenticator` :rtype: zmq.auth.thread.ThreadAuthenticator""" - + return None config = get_config() key_manager = KeyManager(config, as_client=True) From 43ffdcfb09fd5f7585746f8d168ce3aba1be0290 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 16 Apr 2024 16:53:50 -0500 Subject: [PATCH 017/101] Increase server timeout --- smartsim/_core/config/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 0bc3afff9..51d65babb 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -166,7 +166,7 @@ def dragon_server_reconnect_timeout(self) -> int: @property def dragon_server_timeout(self) -> int: - return int(os.getenv("SMARTSIM_DRAGON_TIMEOUT", "5000")) + return int(os.getenv("SMARTSIM_DRAGON_TIMEOUT", "10000")) @property def dragon_server_startup_timeout(self) -> int: From 0e3c1ff59e0aec605a1036320832e0401d218c76 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 16 Apr 2024 17:15:45 -0500 Subject: [PATCH 018/101] Restore authenticator --- .../_core/launcher/dragon/dragonConnector.py | 22 +++++++++---------- .../_core/launcher/dragon/dragonSockets.py | 1 - 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py 
b/smartsim/_core/launcher/dragon/dragonConnector.py index 164fd4058..2ebd7743d 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -365,18 +365,16 @@ def _dragon_cleanup( finally: time.sleep(1) - if not psutil.pid_exists(server_process_pid) or not server_process_pid: - return - - try: - os.kill(server_process_pid, signal.SIGINT) - print("Sent SIGINT to dragon server") - time.sleep(5) - if psutil.pid_exists(server_process_pid): - os.kill(server_process_pid, signal.SIGTERM) - except ProcessLookupError: - # Can't use the logger as I/O file may be closed - print("Dragon server is not running.", flush=True) + if psutil.pid_exists(server_process_pid) and server_process_pid: + try: + os.kill(server_process_pid, signal.SIGINT) + print("Sent SIGINT to dragon server") + time.sleep(5) + if psutil.pid_exists(server_process_pid): + os.kill(server_process_pid, signal.SIGTERM) + except ProcessLookupError: + # Can't use the logger as I/O file may be closed + print("Dragon server is not running.", flush=True) try: diff --git a/smartsim/_core/launcher/dragon/dragonSockets.py b/smartsim/_core/launcher/dragon/dragonSockets.py index f071a81fe..1de9d8819 100644 --- a/smartsim/_core/launcher/dragon/dragonSockets.py +++ b/smartsim/_core/launcher/dragon/dragonSockets.py @@ -115,7 +115,6 @@ def get_authenticator( :type context: zmq.Context :returns: the activated `Authenticator` :rtype: zmq.auth.thread.ThreadAuthenticator""" - return None config = get_config() key_manager = KeyManager(config, as_client=True) From 2feceb91b9e0e4ea32e83b5d4d6b53c52647b60c Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 17 Apr 2024 01:38:57 -0500 Subject: [PATCH 019/101] 20 seconds should be enough --- smartsim/_core/config/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 51d65babb..6e892ab27 100644 --- 
a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -162,11 +162,11 @@ def dragon_server_path(self) -> t.Optional[str]: @property def dragon_server_reconnect_timeout(self) -> int: - return int(os.getenv("SMARTSIM_DRAGON_RECONNECT_TIMEOUT", "5000")) + return int(os.getenv("SMARTSIM_DRAGON_RECONNECT_TIMEOUT", "20000")) @property def dragon_server_timeout(self) -> int: - return int(os.getenv("SMARTSIM_DRAGON_TIMEOUT", "10000")) + return int(os.getenv("SMARTSIM_DRAGON_TIMEOUT", "20000")) @property def dragon_server_startup_timeout(self) -> int: From cb8d690d59388d7fbbd42281f2d8e4f24d0e684e Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 17 Apr 2024 07:07:46 -0500 Subject: [PATCH 020/101] Final fix for auth respawning --- smartsim/_core/entrypoints/dragon.py | 5 ++ .../_core/launcher/dragon/dragonConnector.py | 62 +++++++++++++------ .../full_wlm/test_generic_orc_launch_batch.py | 2 +- 3 files changed, 48 insertions(+), 21 deletions(-) diff --git a/smartsim/_core/entrypoints/dragon.py b/smartsim/_core/entrypoints/dragon.py index 61d9fa9eb..e53c1c697 100644 --- a/smartsim/_core/entrypoints/dragon.py +++ b/smartsim/_core/entrypoints/dragon.py @@ -107,6 +107,7 @@ def run( server = dragonSockets.as_server(dragon_head_socket) logger.debug(f"Listening to {dragon_head_address}") + while not (dragon_backend.should_shutdown or SHUTDOWN_INITIATED): try: req = server.recv() @@ -154,6 +155,10 @@ def main(args: argparse.Namespace) -> int: if args.launching_address: zmq_context = zmq.Context() + zmq_context.setsockopt(zmq.SNDTIMEO, value=20000) + zmq_context.setsockopt(zmq.RCVTIMEO, value=20000) + zmq_context.setsockopt(zmq.REQ_CORRELATE, 1) + zmq_context.setsockopt(zmq.REQ_RELAXED, 1) if str(args.launching_address).split(":", maxsplit=1)[0] == dragon_head_address: address = "localhost" diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 2ebd7743d..80933f974 100644 --- 
a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -64,31 +64,30 @@ _SchemaT = t.TypeVar("_SchemaT", bound=t.Union[DragonRequest, DragonResponse]) DRG_LOCK = RLock() -DRG_CTX = zmq.Context() -DRG_CTX.setsockopt(zmq.REQ_CORRELATE, 1) -DRG_CTX.setsockopt(zmq.REQ_RELAXED, 1) class DragonConnector: """This class encapsulates the functionality needed - to launch start a Dragon server and communicate with it. + to start a Dragon server and communicate with it. """ def __init__(self) -> None: super().__init__() - self._context = DRG_CTX + self._context = None + self._context = zmq.Context() + self._context.setsockopt(zmq.REQ_CORRELATE, 1) + self._context.setsockopt(zmq.REQ_RELAXED, 1) + self._authenticator = dragonSockets.get_authenticator(self._context) self._timeout = CONFIG.dragon_server_timeout self._reconnect_timeout = CONFIG.dragon_server_reconnect_timeout self._startup_timeout = CONFIG.dragon_server_startup_timeout - self._context.setsockopt(zmq.SNDTIMEO, value=self._timeout) - self._context.setsockopt(zmq.RCVTIMEO, value=self._timeout) + self._set_timeout(self._timeout) self._dragon_head_socket: t.Optional[zmq.Socket[t.Any]] = None self._dragon_head_process: t.Optional[subprocess.Popen[bytes]] = None # Returned by dragon head, useful if shutdown is to be requested # but process was started by another connector self._dragon_head_pid: t.Optional[int] = None - self._authenticator: t.Optional[zmq.auth.thread.ThreadAuthenticator] = None self._dragon_server_path = CONFIG.dragon_server_path logger.debug(f"Dragon Server path was set to {self._dragon_server_path}") if self._dragon_server_path is None: @@ -119,6 +118,10 @@ def _handshake(self, address: str) -> None: ) except (zmq.ZMQError, zmq.Again) as e: logger.debug(e) + try: + self._authenticator.stop() + except zmq.Again: + logger.error("Could not stop authenticator") self._dragon_head_socket.close() self._dragon_head_socket = None raise SmartSimError( @@ 
-128,6 +131,10 @@ def _handshake(self, address: str) -> None: def _set_timeout(self, timeout: int) -> None: self._context.setsockopt(zmq.SNDTIMEO, value=timeout) self._context.setsockopt(zmq.RCVTIMEO, value=timeout) + if self._authenticator is not None and self._authenticator.thread is not None: + self._authenticator.thread.authenticator.zap_socket.setsockopt(zmq.SNDTIMEO, optval=timeout) + self._authenticator.thread.authenticator.zap_socket.setsockopt(zmq.RCVTIMEO, optval=timeout) + def ensure_connected(self) -> None: if not self.is_connected: @@ -148,7 +155,6 @@ def connect_to_dragon(self) -> None: path = _resolve_dragon_path(self._dragon_server_path) dragon_config_log = path / CONFIG.dragon_log_filename - self._authenticator = dragonSockets.get_authenticator(self._context) if dragon_config_log.is_file(): dragon_confs = self._parse_launched_dragon_server_info_from_files( @@ -167,11 +173,17 @@ def connect_to_dragon(self) -> None: self._handshake(dragon_conf["address"]) except SmartSimError as e: logger.warning(e) + logger.debug("Closing ZAP socket") + if self._authenticator.thread is not None: + self._authenticator.thread.authenticator.zap_socket.close() + logger.debug("Getting new auth") + self._authenticator = dragonSockets.get_authenticator(self._context) finally: self._set_timeout(self._timeout) if self.is_connected: return + path.mkdir(parents=True, exist_ok=True) cmd = [ @@ -194,6 +206,7 @@ def connect_to_dragon(self) -> None: connector_socket = dragonSockets.get_secure_socket( self._context, zmq.REP, True ) + # find first available port >= 5995 port = find_free_port(start=5995) socket_addr = f"tcp://{address}:{port}" @@ -210,6 +223,8 @@ def connect_to_dragon(self) -> None: ) as dragon_err: current_env = os.environ.copy() current_env.update({"PYTHONUNBUFFERED": "1"}) + logger.debug(f"Starting Dragon environment: {' '.join(cmd)}") + # pylint: disable-next=consider-using-with self._dragon_head_process = subprocess.Popen( args=cmd, @@ -281,6 +296,9 @@ def 
cleanup(self) -> None: server_process_pid=self._dragon_head_pid, server_authenticator=self._authenticator, ) + self._dragon_head_socket = None + self._dragon_head_pid = 0 + self._authenticator = None def send_request(self, request: DragonRequest, flags: int = 0) -> DragonResponse: self.ensure_connected() @@ -318,6 +336,7 @@ def _parse_launched_dragon_server_info_from_files( dragon_envs = cls._parse_launched_dragon_server_info_from_iterable( ifstream, num_dragon_envs ) + return dragon_envs @staticmethod @@ -328,8 +347,6 @@ def _send_req_with_socket( with DRG_LOCK: logger.debug(f"Sending {type(request).__name__}: {request}") client.send(request, flags) - - time.sleep(0.1) response = client.recv() logger.debug(f"Received {type(response).__name__}: {response}") @@ -357,13 +374,23 @@ def _dragon_cleanup( """ try: if server_socket is not None: + print("Sending shutdown request to dragon environment") DragonConnector._send_req_with_socket(server_socket, DragonShutdownRequest()) - except zmq.error.ZMQError as e: + except (zmq.error.ZMQError, zmq.Again) as e: # Can't use the logger as I/O file may be closed print("Could not send shutdown request to dragon server") print(f"ZMQ error: {e}", flush=True) finally: - time.sleep(1) + print("Sending shutdown request is complete") + + try: + if server_authenticator is not None and server_authenticator.is_alive(): + print("Shutting down ZMQ authenticator") + server_authenticator.stop() + except Exception: + print("Authenticator shutdown error") + finally: + print("Authenticator shutdown is complete") if psutil.pid_exists(server_process_pid) and server_process_pid: try: @@ -375,13 +402,8 @@ def _dragon_cleanup( except ProcessLookupError: # Can't use the logger as I/O file may be closed print("Dragon server is not running.", flush=True) - - - try: - if server_authenticator is not None: - server_authenticator.stop() - except Exception: - print("Authenticator shutdown error") + finally: + print("Dragon server process shutdown is 
complete") def _resolve_dragon_path(fallback: t.Union[str, "os.PathLike[str]"]) -> Path: diff --git a/tests/full_wlm/test_generic_orc_launch_batch.py b/tests/full_wlm/test_generic_orc_launch_batch.py index d097aa439..fc725f4b3 100644 --- a/tests/full_wlm/test_generic_orc_launch_batch.py +++ b/tests/full_wlm/test_generic_orc_launch_batch.py @@ -163,7 +163,7 @@ def test_launch_cluster_orc_reconnect(test_dir, wlmutils): orc.batch_settings.set_account(wlmutils.get_test_account()) - orc.batch_settings.set_walltime("00:01:00") + orc.batch_settings.set_walltime("00:02:00") exp.start(orc, block=True) From 352f3299643fa051c4c0b0fdce84c2e90baf64a0 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 17 Apr 2024 11:33:49 -0500 Subject: [PATCH 021/101] Fix test, switch to logger in backend --- smartsim/_core/entrypoints/dragon.py | 17 +++-- .../_core/launcher/dragon/dragonBackend.py | 63 ++++++++++--------- tests/test_dragon_launcher.py | 34 ++++++---- 3 files changed, 70 insertions(+), 44 deletions(-) diff --git a/smartsim/_core/entrypoints/dragon.py b/smartsim/_core/entrypoints/dragon.py index e53c1c697..b91040263 100644 --- a/smartsim/_core/entrypoints/dragon.py +++ b/smartsim/_core/entrypoints/dragon.py @@ -36,6 +36,7 @@ from types import FrameType import zmq +import zmq.auth.thread from smartsim._core.launcher.dragon import dragonSockets from smartsim._core.launcher.dragon.dragonBackend import DragonBackend @@ -87,13 +88,16 @@ def run( zmq_context: "zmq.Context[t.Any]", dragon_head_address: str, dragon_pid: int, + zmq_authenticator: "zmq.auth.thread.ThreadAuthenticator", ) -> None: logger.debug(f"Opening socket {dragon_head_address}") - zmq_context.setsockopt(zmq.SNDTIMEO, value=1000) - zmq_context.setsockopt(zmq.RCVTIMEO, value=1000) + zmq_context.setsockopt(zmq.SNDTIMEO, value=-1) + zmq_context.setsockopt(zmq.RCVTIMEO, value=-1) zmq_context.setsockopt(zmq.REQ_CORRELATE, 1) zmq_context.setsockopt(zmq.REQ_RELAXED, 1) + 
zmq_authenticator.thread.authenticator.zap_socket.setsockopt(zmq.SNDTIMEO, optval=-1) + zmq_authenticator.thread.authenticator.zap_socket.setsockopt(zmq.RCVTIMEO, optval=-1) dragon_head_socket = dragonSockets.get_secure_socket(zmq_context, zmq.REP, True) dragon_head_socket.bind(dragon_head_address) @@ -114,7 +118,7 @@ def run( logger.debug(f"Received {type(req).__name__} {req}") except zmq.Again: if not (dragon_backend.should_shutdown or SHUTDOWN_INITIATED): - logger.debug(f"Listening to {dragon_head_address}") + # logger.debug(f"Listening to {dragon_head_address}") continue logger.info("Shutdown has been requested") break @@ -155,8 +159,8 @@ def main(args: argparse.Namespace) -> int: if args.launching_address: zmq_context = zmq.Context() - zmq_context.setsockopt(zmq.SNDTIMEO, value=20000) - zmq_context.setsockopt(zmq.RCVTIMEO, value=20000) + zmq_context.setsockopt(zmq.SNDTIMEO, value=-1) + zmq_context.setsockopt(zmq.RCVTIMEO, value=-1) zmq_context.setsockopt(zmq.REQ_CORRELATE, 1) zmq_context.setsockopt(zmq.REQ_RELAXED, 1) @@ -167,6 +171,8 @@ def main(args: argparse.Namespace) -> int: dragon_head_address += ":5555" zmq_authenticator = dragonSockets.get_authenticator(zmq_context) + zmq_authenticator.thread.authenticator.zap_socket.setsockopt(zmq.SNDTIMEO, optval=-1) + zmq_authenticator.thread.authenticator.zap_socket.setsockopt(zmq.RCVTIMEO, optval=-1) logger.debug("Getting launcher socket") launcher_socket = dragonSockets.get_secure_socket(zmq_context, zmq.REQ, False) @@ -195,6 +201,7 @@ def main(args: argparse.Namespace) -> int: zmq_context=zmq_context, dragon_head_address=dragon_head_address, dragon_pid=response.dragon_pid, + zmq_authenticator=zmq_authenticator, ) except Exception as e: logger.error(f"Dragon server failed with {e}", exc_info=True) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 3c15846f3..ef63246f6 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ 
b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -43,7 +43,7 @@ # pylint: enable=import-error # isort: on -from smartsim._core.schemas import ( +from ...._core.schemas import ( DragonHandshakeRequest, DragonHandshakeResponse, DragonRequest, @@ -57,12 +57,14 @@ DragonUpdateStatusRequest, DragonUpdateStatusResponse, ) -from smartsim._core.utils.helpers import create_short_id_str -from smartsim.status import TERMINAL_STATUSES, SmartSimStatus +from ...._core.utils.helpers import create_short_id_str +from ....status import TERMINAL_STATUSES, SmartSimStatus +from ....log import get_logger DRG_ERROR_STATUS = "Error" DRG_RUNNING_STATUS = "Running" +logger = get_logger(__name__) @dataclass class ProcessGroupInfo: @@ -120,24 +122,21 @@ def __init__(self, pid: int) -> None: self._stop_requests: t.Deque[str] = collections.deque() self._running_steps: t.List[str] = [] self._completed_steps: t.List[str] = [] - + self._last_update_time: int = time.time_ns() // 1e9 num_hosts = len(self._hosts) host_string = str(num_hosts) + (" hosts" if num_hosts > 1 else " host") self._shutdown_requested = False self._can_shutdown = False self._frontend_shutdown: t.Optional[bool] = None - self._updates = 0 - print(f"{host_string} available for execution: {self._hosts}") + logger.debug(f"{host_string} available for execution: {self._hosts}") def print_status(self) -> None: - print("\n-----------------------Launcher Status-----------------------") - print(f"| {self._updates}: System hosts: ", self._hosts) - print(f"| {self._updates}: Free hosts: ", list(self._free_hosts)) - print(f"| {self._updates}: Allocated hosts: ", self._allocated_hosts) - print(f"| {self._updates}: Running steps: ", self._running_steps) - print(f"| {self._updates}: Group infos: ", self._group_infos) - print(f"| {self._updates}: There are {len(self._queued_steps)} queued steps") - print("-------------------------------------------------------------\n") + logger.debug(f"System hosts: {self._hosts}") + logger.debug(f"Free 
hosts: {list(self._free_hosts)}") + logger.debug(f"Allocated hosts: {self._allocated_hosts}") + logger.debug(f"Running steps: {self._running_steps}") + logger.debug(f"Group infos: {self._group_infos}") + logger.debug(f"There are {len(self._queued_steps)} queued steps") @property def frontend_shutdown(self) -> bool: @@ -251,10 +250,10 @@ def _stop_steps(self) -> None: request = self._stop_requests.popleft() step_id = request.step_id if step_id not in self._group_infos: - print(f"Requested to stop non-existing step {step_id}") + logger.error(f"Requested to stop non-existing step {step_id}") continue - print(f"Stopping step {step_id}") + logger.debug(f"Stopping step {step_id}") if request.step_id in self._queued_steps: self._queued_steps.pop(step_id) else: @@ -268,7 +267,7 @@ def _stop_steps(self) -> None: try: proc_group.stop() except DragonProcessGroupError: - print("Process group already stopped") + logger.error("Process group already stopped") self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED self._group_infos[step_id].return_codes = [-9] @@ -281,7 +280,7 @@ def _start_steps(self) -> None: if not hosts: continue - print(f"Step id {step_id} allocated on {hosts}") + logger.debug(f"Step id {step_id} allocated on {hosts}") global_policy = Policy( placement=Policy.Placement.HOST_NAME, host_name=hosts[0] @@ -311,7 +310,7 @@ def _start_steps(self) -> None: grp.init() grp.start() except Exception as e: - print(e) + logger.error(e) try: puids = grp.puids @@ -325,7 +324,7 @@ def _start_steps(self) -> None: self._running_steps.append(step_id) started.append(step_id) except Exception as e: - print(e) + logger.error(e) try: DragonBackend._start_redirect_workers( @@ -339,13 +338,13 @@ def _start_steps(self) -> None: raise IOError("Could not redirect output") from e if started: - print(f"{self._updates}: {started=}") + logger.debug(f"{started=}") for step_id in started: try: self._queued_steps.pop(step_id) except KeyError as e: - print(e) + logger.error(e) def 
_refresh_statuses(self) -> None: terminated = [] @@ -366,7 +365,7 @@ def _refresh_statuses(self) -> None: Process(None, ident=puid).returncode for puid in puids ] except (ValueError, TypeError) as e: - print(e) + logger.error(e) group_info.return_codes = [-1 for _ in puids] else: group_info.return_codes = [0] @@ -382,7 +381,7 @@ def _refresh_statuses(self) -> None: terminated.append(step_id) if terminated: - print(f"{self._updates}: {terminated=}", flush=True) + logger.debug(f"{terminated=}") for step_id in terminated: self._running_steps.remove(step_id) self._completed_steps.append(step_id) @@ -390,7 +389,7 @@ def _refresh_statuses(self) -> None: if group_info is not None: with self._hostlist_lock: for host in group_info.hosts: - print(f"{self._updates}: Releasing host {host}", flush=True) + logger.debug(f"Releasing host {host}") self._allocated_hosts.pop(host) self._free_hosts.append(host) @@ -400,18 +399,26 @@ def _update_shutdown_status(self) -> None: for grp_info in self._group_infos.values() ) + def _should_update(self): + current_time = time.time_ns() // 1e9 + if current_time - self._last_update_time > 10: + self._last_update_time = current_time + return True + return False + def update(self, interval: float=0.01) -> None: + logger.debug("Dragon Backend update thread started") + time.time_ns while True: try: - self._updates += 1 self._stop_steps() self._start_steps() self._refresh_statuses() self._update_shutdown_status() time.sleep(0.1) except Exception as e: - print(e) - if (self._updates % int(10/interval)) == 0: + logger.error(e) + if self._should_update(): self.print_status() @process_request.register diff --git a/tests/test_dragon_launcher.py b/tests/test_dragon_launcher.py index cdbc830fa..78e2641c5 100644 --- a/tests/test_dragon_launcher.py +++ b/tests/test_dragon_launcher.py @@ -29,17 +29,20 @@ import os import sys import typing as t +import zmq import pytest -from smartsim._core.launcher.dragon.dragonLauncher import DragonLauncher +from 
smartsim._core.config.config import get_config +from smartsim._core.launcher.dragon.dragonLauncher import DragonConnector, DragonLauncher from smartsim._core.launcher.dragon.dragonSockets import ( get_authenticator, get_secure_socket, ) from smartsim._core.schemas.dragonRequests import DragonBootstrapRequest -from smartsim._core.utils.network import IFConfig -from smartsim.error.errors import LauncherError +from smartsim._core.schemas.dragonResponses import DragonHandshakeResponse +from smartsim._core.utils.network import IFConfig, find_free_port +from smartsim._core.utils.security import KeyManager # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a @@ -53,7 +56,7 @@ def __init__(self, *args: t.Any, **kwargs: t.Any) -> None: ... @property def pid(self) -> int: - return 1 + return 99999 @property def returncode(self) -> int: @@ -95,6 +98,7 @@ def __init__(self, context: zmq.Context) -> None: self.num_stops: int = 0 self.num_configure_curves: int = 0 self.context = context + self.thread = None def configure_curve(self, *args, **kwargs) -> None: self.cfg_args = args @@ -179,18 +183,26 @@ def test_dragon_connect_bind_address(monkeypatch: pytest.MonkeyPatch, test_dir: in the range supplied""" with monkeypatch.context() as ctx: + # make sure we don't touch "real keys" during a test + ctx.setenv("SMARTSIM_KEY_PATH", test_dir) + + mock_socket = MockSocket() + + # look at test_dir for dragon config ctx.setenv("SMARTSIM_DRAGON_SERVER_PATH", test_dir) + # avoid finding real interface ctx.setattr( "smartsim._core.launcher.dragon.dragonConnector.get_best_interface_and_address", lambda: IFConfig(interface="faux_interface", address="127.0.0.1"), ) + # we need to set the socket value or is_connected returns False ctx.setattr( "smartsim._core.launcher.dragon.dragonLauncher.DragonConnector._handshake", lambda self, address: ..., ) - - mock_socket = MockSocket() - + # avoid starting a real authenticator thread + 
ctx.setattr("zmq.auth.thread.ThreadAuthenticator", MockAuthenticator) + # avoid starting a real zmq socket ctx.setattr("zmq.Context.socket", mock_socket) ctx.setattr("subprocess.Popen", lambda *args, **kwargs: MockPopen()) @@ -318,7 +330,7 @@ def test_dragon_launcher_handshake(monkeypatch: pytest.MonkeyPatch, test_dir: st ctx.setenv("SMARTSIM_DRAGON_SERVER_PATH", test_dir) # avoid finding real interface since we may not be on a super ctx.setattr( - "smartsim._core.launcher.dragon.dragonLauncher.get_best_interface_and_address", + "smartsim._core.launcher.dragon.dragonConnector.get_best_interface_and_address", lambda: IFConfig("faux_interface", addr), ) @@ -336,11 +348,11 @@ def fn(*args, **kwargs): ctx.setattr("subprocess.Popen", fn) - launcher = DragonLauncher() + connector = DragonConnector() try: # connect executes the complete handshake and raises an exception if comms fails - launcher.connect_to_dragon(test_dir) + connector.connect_to_dragon() finally: - launcher.cleanup() + connector.cleanup() ... 
From 7e3fd9b4297688883ec50bd27a01800c03148cb3 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 17 Apr 2024 11:49:37 -0500 Subject: [PATCH 022/101] Comply to standards --- smartsim/_core/entrypoints/dragon.py | 8 ++-- .../_core/launcher/dragon/dragonBackend.py | 42 ++++++++++-------- .../_core/launcher/dragon/dragonConnector.py | 43 ++++++++++++------- .../_core/launcher/dragon/dragonLauncher.py | 4 +- smartsim/_core/launcher/launcher.py | 4 +- smartsim/_core/launcher/local/local.py | 2 +- smartsim/_core/launcher/step/dragonStep.py | 3 +- tests/test_dragon_launcher.py | 7 ++- 8 files changed, 67 insertions(+), 46 deletions(-) diff --git a/smartsim/_core/entrypoints/dragon.py b/smartsim/_core/entrypoints/dragon.py index b91040263..b6921f4ec 100644 --- a/smartsim/_core/entrypoints/dragon.py +++ b/smartsim/_core/entrypoints/dragon.py @@ -96,8 +96,8 @@ def run( zmq_context.setsockopt(zmq.RCVTIMEO, value=-1) zmq_context.setsockopt(zmq.REQ_CORRELATE, 1) zmq_context.setsockopt(zmq.REQ_RELAXED, 1) - zmq_authenticator.thread.authenticator.zap_socket.setsockopt(zmq.SNDTIMEO, optval=-1) - zmq_authenticator.thread.authenticator.zap_socket.setsockopt(zmq.RCVTIMEO, optval=-1) + zmq_authenticator.thread.authenticator.zap_socket.setsockopt(zmq.SNDTIMEO, -1) + zmq_authenticator.thread.authenticator.zap_socket.setsockopt(zmq.RCVTIMEO, -1) dragon_head_socket = dragonSockets.get_secure_socket(zmq_context, zmq.REP, True) dragon_head_socket.bind(dragon_head_address) @@ -171,8 +171,8 @@ def main(args: argparse.Namespace) -> int: dragon_head_address += ":5555" zmq_authenticator = dragonSockets.get_authenticator(zmq_context) - zmq_authenticator.thread.authenticator.zap_socket.setsockopt(zmq.SNDTIMEO, optval=-1) - zmq_authenticator.thread.authenticator.zap_socket.setsockopt(zmq.RCVTIMEO, optval=-1) + zmq_authenticator.thread.authenticator.zap_socket.setsockopt(zmq.SNDTIMEO, -1) + zmq_authenticator.thread.authenticator.zap_socket.setsockopt(zmq.RCVTIMEO, -1) logger.debug("Getting launcher 
socket") launcher_socket = dragonSockets.get_secure_socket(zmq_context, zmq.REQ, False) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index ef63246f6..5f530ea9c 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -58,14 +58,15 @@ DragonUpdateStatusResponse, ) from ...._core.utils.helpers import create_short_id_str -from ....status import TERMINAL_STATUSES, SmartSimStatus from ....log import get_logger +from ....status import TERMINAL_STATUSES, SmartSimStatus DRG_ERROR_STATUS = "Error" DRG_RUNNING_STATUS = "Running" logger = get_logger(__name__) + @dataclass class ProcessGroupInfo: status: SmartSimStatus @@ -119,10 +120,10 @@ def __init__(self, pid: int) -> None: self._queued_steps: "collections.OrderedDict[str, DragonRunRequest]" = ( collections.OrderedDict() ) - self._stop_requests: t.Deque[str] = collections.deque() + self._stop_requests: t.Deque[DragonStopRequest] = collections.deque() self._running_steps: t.List[str] = [] self._completed_steps: t.List[str] = [] - self._last_update_time: int = time.time_ns() // 1e9 + self._last_update_time: float = time.time_ns() / 1e9 num_hosts = len(self._hosts) host_string = str(num_hosts) + (" hosts" if num_hosts > 1 else " host") self._shutdown_requested = False @@ -260,7 +261,10 @@ def _stop_steps(self) -> None: # Technically we could just terminate, but what if # the application intercepts that and ignores it? 
proc_group = self._group_infos[step_id].process_group - if proc_group is None and proc_group.status not in TERMINAL_STATUSES: + if ( + proc_group is not None + and proc_group.status not in TERMINAL_STATUSES + ): try: proc_group.kill() except DragonProcessGroupError: @@ -272,7 +276,6 @@ def _stop_steps(self) -> None: self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED self._group_infos[step_id].return_codes = [-9] - def _start_steps(self) -> None: started = [] for step_id, request in self._queued_steps.items(): @@ -312,6 +315,7 @@ def _start_steps(self) -> None: except Exception as e: logger.error(e) + puids = None try: puids = grp.puids self._group_infos[step_id] = ProcessGroupInfo( @@ -326,16 +330,17 @@ def _start_steps(self) -> None: except Exception as e: logger.error(e) - try: - DragonBackend._start_redirect_workers( - global_policy, - policies, - puids, - request.output_file, - request.error_file, - ) - except Exception as e: - raise IOError("Could not redirect output") from e + if puids is not None: + try: + DragonBackend._start_redirect_workers( + global_policy, + policies, + puids, + request.output_file, + request.error_file, + ) + except Exception as e: + raise IOError("Could not redirect output") from e if started: logger.debug(f"{started=}") @@ -399,16 +404,15 @@ def _update_shutdown_status(self) -> None: for grp_info in self._group_infos.values() ) - def _should_update(self): - current_time = time.time_ns() // 1e9 + def _should_update(self) -> bool: + current_time = time.time_ns() / 1e9 if current_time - self._last_update_time > 10: self._last_update_time = current_time return True return False - def update(self, interval: float=0.01) -> None: + def update(self) -> None: logger.debug("Dragon Backend update thread started") - time.time_ns while True: try: self._stop_steps() diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 80933f974..3cea7f990 100644 --- 
a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -74,11 +74,12 @@ class DragonConnector: def __init__(self) -> None: super().__init__() - self._context = None - self._context = zmq.Context() + self._context: zmq.Context[t.Any] = zmq.Context() self._context.setsockopt(zmq.REQ_CORRELATE, 1) self._context.setsockopt(zmq.REQ_RELAXED, 1) - self._authenticator = dragonSockets.get_authenticator(self._context) + self._authenticator: t.Optional[zmq.auth.thread.ThreadAuthenticator] = ( + dragonSockets.get_authenticator(self._context) + ) self._timeout = CONFIG.dragon_server_timeout self._reconnect_timeout = CONFIG.dragon_server_reconnect_timeout self._startup_timeout = CONFIG.dragon_server_startup_timeout @@ -118,10 +119,11 @@ def _handshake(self, address: str) -> None: ) except (zmq.ZMQError, zmq.Again) as e: logger.debug(e) - try: - self._authenticator.stop() - except zmq.Again: - logger.error("Could not stop authenticator") + if self._authenticator is not None: + try: + self._authenticator.stop() + except zmq.Again: + logger.error("Could not stop authenticator") self._dragon_head_socket.close() self._dragon_head_socket = None raise SmartSimError( @@ -132,9 +134,12 @@ def _set_timeout(self, timeout: int) -> None: self._context.setsockopt(zmq.SNDTIMEO, value=timeout) self._context.setsockopt(zmq.RCVTIMEO, value=timeout) if self._authenticator is not None and self._authenticator.thread is not None: - self._authenticator.thread.authenticator.zap_socket.setsockopt(zmq.SNDTIMEO, optval=timeout) - self._authenticator.thread.authenticator.zap_socket.setsockopt(zmq.RCVTIMEO, optval=timeout) - + self._authenticator.thread.authenticator.zap_socket.setsockopt( + zmq.SNDTIMEO, timeout + ) + self._authenticator.thread.authenticator.zap_socket.setsockopt( + zmq.RCVTIMEO, timeout + ) def ensure_connected(self) -> None: if not self.is_connected: @@ -155,7 +160,6 @@ def connect_to_dragon(self) -> None: path = 
_resolve_dragon_path(self._dragon_server_path) dragon_config_log = path / CONFIG.dragon_log_filename - if dragon_config_log.is_file(): dragon_confs = self._parse_launched_dragon_server_info_from_files( [dragon_config_log] @@ -174,16 +178,20 @@ def connect_to_dragon(self) -> None: except SmartSimError as e: logger.warning(e) logger.debug("Closing ZAP socket") - if self._authenticator.thread is not None: + if ( + self._authenticator is not None + and self._authenticator.thread is not None + ): self._authenticator.thread.authenticator.zap_socket.close() logger.debug("Getting new auth") - self._authenticator = dragonSockets.get_authenticator(self._context) + self._authenticator = dragonSockets.get_authenticator( + self._context + ) finally: self._set_timeout(self._timeout) if self.is_connected: return - path.mkdir(parents=True, exist_ok=True) cmd = [ @@ -375,7 +383,10 @@ def _dragon_cleanup( try: if server_socket is not None: print("Sending shutdown request to dragon environment") - DragonConnector._send_req_with_socket(server_socket, DragonShutdownRequest()) + # pylint: disable-next=protected-access + DragonConnector._send_req_with_socket( + server_socket, DragonShutdownRequest() + ) except (zmq.error.ZMQError, zmq.Again) as e: # Can't use the logger as I/O file may be closed print("Could not send shutdown request to dragon server") @@ -392,7 +403,7 @@ def _dragon_cleanup( finally: print("Authenticator shutdown is complete") - if psutil.pid_exists(server_process_pid) and server_process_pid: + if server_process_pid and psutil.pid_exists(server_process_pid): try: os.kill(server_process_pid, signal.SIGINT) print("Sent SIGINT to dragon server") diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py index de10e7e37..27b26d619 100644 --- a/smartsim/_core/launcher/dragon/dragonLauncher.py +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -93,9 +93,11 @@ def supported_rs(self) -> t.Dict[t.Type[SettingsBase], 
t.Type[Step]]: RunSettings: DragonStep, } - def add_step_to_mapping_table(self, name: str, step_map: StepMap): + def add_step_to_mapping_table(self, name: str, step_map: StepMap) -> None: super().add_step_to_mapping_table(name, step_map) + if step_map.step_id is None: + return if step_map.step_id.startswith("SLURM-"): slurm_step_map = StepMap( step_id=DragonLauncher._unprefix_step_id(step_map.step_id), diff --git a/smartsim/_core/launcher/launcher.py b/smartsim/_core/launcher/launcher.py index ba79060d9..9f4e36150 100644 --- a/smartsim/_core/launcher/launcher.py +++ b/smartsim/_core/launcher/launcher.py @@ -71,7 +71,7 @@ def stop(self, step_name: str) -> StepInfo: raise NotImplementedError @abc.abstractmethod - def add_step_to_mapping_table(self, name: str, step_map: StepMap): + def add_step_to_mapping_table(self, name: str, step_map: StepMap) -> None: raise NotImplementedError @@ -91,7 +91,7 @@ def __init__(self) -> None: def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: raise NotImplementedError - def add_step_to_mapping_table(self, name: str, step_map: StepMap): + def add_step_to_mapping_table(self, name: str, step_map: StepMap) -> None: self.step_mapping[name] = step_map # every launcher utilizing this interface must have a map diff --git a/smartsim/_core/launcher/local/local.py b/smartsim/_core/launcher/local/local.py index 036ecdf12..cad219897 100644 --- a/smartsim/_core/launcher/local/local.py +++ b/smartsim/_core/launcher/local/local.py @@ -41,7 +41,7 @@ def __init__(self) -> None: self.task_manager = TaskManager() self.step_mapping = StepMapping() - def add_step_to_mapping_table(self, name: str, step_map: StepMap): + def add_step_to_mapping_table(self, name: str, step_map: StepMap) -> None: self.step_mapping[name] = step_map def create_step(self, name: str, cwd: str, step_settings: SettingsBase) -> Step: diff --git a/smartsim/_core/launcher/step/dragonStep.py b/smartsim/_core/launcher/step/dragonStep.py index b26765ff1..e799bb299 100644 
--- a/smartsim/_core/launcher/step/dragonStep.py +++ b/smartsim/_core/launcher/step/dragonStep.py @@ -216,7 +216,8 @@ def _write_sbatch_script(self) -> str: script_file.write(f"#SBATCH {opt}\n") script_file.write( - f"#SBATCH --export=ALL,SMARTSIM_DRAGON_SERVER_PATH={self.cwd},PYTHONUNBUFFERED=1\n" + f"#SBATCH --export=ALL,SMARTSIM_DRAGON_SERVER_PATH={self.cwd}," + "PYTHONUNBUFFERED=1\n" ) for cmd in self.batch_settings.preamble: diff --git a/tests/test_dragon_launcher.py b/tests/test_dragon_launcher.py index 78e2641c5..5b28307a9 100644 --- a/tests/test_dragon_launcher.py +++ b/tests/test_dragon_launcher.py @@ -29,12 +29,15 @@ import os import sys import typing as t -import zmq import pytest +import zmq from smartsim._core.config.config import get_config -from smartsim._core.launcher.dragon.dragonLauncher import DragonConnector, DragonLauncher +from smartsim._core.launcher.dragon.dragonLauncher import ( + DragonConnector, + DragonLauncher, +) from smartsim._core.launcher.dragon.dragonSockets import ( get_authenticator, get_secure_socket, From 3489f4074eaa8b80b033f6af695fe73f30be8a92 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 17 Apr 2024 17:36:35 -0500 Subject: [PATCH 023/101] Escalate signals to take down Dragon server --- smartsim/_core/entrypoints/dragon.py | 5 +- .../_core/launcher/dragon/dragonBackend.py | 275 +++++++++--------- .../_core/launcher/dragon/dragonConnector.py | 17 +- smartsim/_core/utils/telemetry/util.py | 2 +- tests/test_controller_errors.py | 18 +- 5 files changed, 170 insertions(+), 147 deletions(-) diff --git a/smartsim/_core/entrypoints/dragon.py b/smartsim/_core/entrypoints/dragon.py index b6921f4ec..6d820947b 100644 --- a/smartsim/_core/entrypoints/dragon.py +++ b/smartsim/_core/entrypoints/dragon.py @@ -77,9 +77,6 @@ def print_summary(network_interface: str, ip_address: str) -> None: HOSTNAME: {socket.gethostname()} DRAGON_SERVER_CONFIG: {json.dumps(zmq_config)} -------------------------------------- - - --------------- Output 
--------------- - """), ) @@ -133,6 +130,8 @@ def run( if not (dragon_backend.should_shutdown or SHUTDOWN_INITIATED): logger.debug(f"Listening to {dragon_head_address}") + if not backend_updater.is_alive(): + backend_updater.start() else: logger.info("Backend shutdown has been requested") break diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 5f530ea9c..0e9f8a7b6 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -111,7 +111,7 @@ def __init__(self, pid: int) -> None: self._pid = pid self._group_infos: t.Dict[str, ProcessGroupInfo] = {} self._step_id_lock = RLock() - self._hostlist_lock = RLock() + self._queue_lock = RLock() self._step_id = 0 # hosts available for execution # dictionary maps hostname to step_id of @@ -148,7 +148,7 @@ def should_shutdown(self) -> bool: return self._shutdown_requested and self._can_shutdown def _initialize_hosts(self) -> None: - with self._hostlist_lock: + with self._queue_lock: self._hosts: t.List[str] = sorted( Node(node).hostname for node in System().nodes ) @@ -173,7 +173,7 @@ def _allocate_step( ) -> t.Optional[t.List[str]]: num_hosts: int = request.nodes - with self._hostlist_lock: + with self._queue_lock: if num_hosts <= 0 or num_hosts > len(self._free_hosts): return None to_allocate = [] @@ -206,11 +206,12 @@ def _(self, request: DragonRunRequest) -> DragonRunResponse: ) return DragonRunResponse(step_id=step_id, error_message=err) - self._queued_steps[step_id] = request - self._group_infos[step_id] = ProcessGroupInfo( - status=SmartSimStatus.STATUS_NEVER_STARTED - ) - return DragonRunResponse(step_id=step_id) + with self._queue_lock: + self._queued_steps[step_id] = request + self._group_infos[step_id] = ProcessGroupInfo( + status=SmartSimStatus.STATUS_NEVER_STARTED + ) + return DragonRunResponse(step_id=step_id) @staticmethod def _start_redirect_workers( @@ -247,152 +248,154 @@ def 
_start_redirect_workers( grp_redir.start() def _stop_steps(self) -> None: - while len(self._stop_requests) > 0: - request = self._stop_requests.popleft() - step_id = request.step_id - if step_id not in self._group_infos: - logger.error(f"Requested to stop non-existing step {step_id}") - continue - - logger.debug(f"Stopping step {step_id}") - if request.step_id in self._queued_steps: - self._queued_steps.pop(step_id) - else: - # Technically we could just terminate, but what if - # the application intercepts that and ignores it? - proc_group = self._group_infos[step_id].process_group - if ( - proc_group is not None - and proc_group.status not in TERMINAL_STATUSES - ): - try: - proc_group.kill() - except DragonProcessGroupError: + with self._queue_lock: + while len(self._stop_requests) > 0: + request = self._stop_requests.popleft() + step_id = request.step_id + if step_id not in self._group_infos: + logger.error(f"Requested to stop non-existing step {step_id}") + continue + + logger.debug(f"Stopping step {step_id}") + if request.step_id in self._queued_steps: + self._queued_steps.pop(step_id) + else: + # Technically we could just terminate, but what if + # the application intercepts that and ignores it? 
+ proc_group = self._group_infos[step_id].process_group + if ( + proc_group is not None + and proc_group.status not in TERMINAL_STATUSES + ): try: - proc_group.stop() + proc_group.kill() except DragonProcessGroupError: - logger.error("Process group already stopped") + try: + proc_group.stop() + except DragonProcessGroupError: + logger.error("Process group already stopped") - self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED - self._group_infos[step_id].return_codes = [-9] + self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED + self._group_infos[step_id].return_codes = [-9] def _start_steps(self) -> None: started = [] - for step_id, request in self._queued_steps.items(): - hosts = self._allocate_step(step_id, self._queued_steps[step_id]) - if not hosts: - continue - - logger.debug(f"Step id {step_id} allocated on {hosts}") + with self._queue_lock: + for step_id, request in self._queued_steps.items(): + hosts = self._allocate_step(step_id, self._queued_steps[step_id]) + if not hosts: + continue - global_policy = Policy( - placement=Policy.Placement.HOST_NAME, host_name=hosts[0] - ) - grp = ProcessGroup( - restart=False, pmi_enabled=request.pmi_enabled, policy=global_policy - ) + logger.debug(f"Step id {step_id} allocated on {hosts}") - policies = [] - for node_name in hosts: - local_policy = Policy( - placement=Policy.Placement.HOST_NAME, host_name=node_name + global_policy = Policy( + placement=Policy.Placement.HOST_NAME, host_name=hosts[0] ) - policies.extend([local_policy] * request.tasks_per_node) - tmp_proc = ProcessTemplate( - target=request.exe, - args=request.exe_args, - cwd=request.path, - env={**request.current_env, **request.env}, - stdout=Popen.PIPE, - stderr=Popen.PIPE, - policy=local_policy, + grp = ProcessGroup( + restart=False, pmi_enabled=request.pmi_enabled, policy=global_policy ) - grp.add_process(nproc=request.tasks_per_node, template=tmp_proc) - try: - grp.init() - grp.start() - except Exception as e: - 
logger.error(e) + policies = [] + for node_name in hosts: + local_policy = Policy( + placement=Policy.Placement.HOST_NAME, host_name=node_name + ) + policies.extend([local_policy] * request.tasks_per_node) + tmp_proc = ProcessTemplate( + target=request.exe, + args=request.exe_args, + cwd=request.path, + env={**request.current_env, **request.env}, + stdout=Popen.PIPE, + stderr=Popen.PIPE, + policy=local_policy, + ) + grp.add_process(nproc=request.tasks_per_node, template=tmp_proc) - puids = None - try: - puids = grp.puids - self._group_infos[step_id] = ProcessGroupInfo( - process_group=grp, - puids=puids, - return_codes=[], - status=SmartSimStatus.STATUS_RUNNING, - hosts=hosts, - ) - self._running_steps.append(step_id) - started.append(step_id) - except Exception as e: - logger.error(e) + try: + grp.init() + grp.start() + except Exception as e: + logger.error(e) - if puids is not None: + puids = None try: - DragonBackend._start_redirect_workers( - global_policy, - policies, - puids, - request.output_file, - request.error_file, + puids = grp.puids + self._group_infos[step_id] = ProcessGroupInfo( + process_group=grp, + puids=puids, + return_codes=[], + status=SmartSimStatus.STATUS_RUNNING, + hosts=hosts, ) + self._running_steps.append(step_id) + started.append(step_id) except Exception as e: - raise IOError("Could not redirect output") from e + logger.error(e) - if started: - logger.debug(f"{started=}") + if puids is not None: + try: + DragonBackend._start_redirect_workers( + global_policy, + policies, + puids, + request.output_file, + request.error_file, + ) + except Exception as e: + raise IOError("Could not redirect output") from e - for step_id in started: - try: - self._queued_steps.pop(step_id) - except KeyError as e: - logger.error(e) + if started: + logger.debug(f"{started=}") + + for step_id in started: + try: + self._queued_steps.pop(step_id) + except KeyError as e: + logger.error(e) def _refresh_statuses(self) -> None: terminated = [] - for step_id in 
self._running_steps: - group_info = self._group_infos[step_id] - grp = group_info.process_group - if grp is None: - group_info.status = SmartSimStatus.STATUS_FAILED - group_info.return_codes = [-1] - elif group_info.status not in TERMINAL_STATUSES: - if grp.status == DRG_RUNNING_STATUS: - group_info.status = SmartSimStatus.STATUS_RUNNING - else: - puids = group_info.puids - if puids is not None and all(puid is not None for puid in puids): - try: - group_info.return_codes = [ - Process(None, ident=puid).returncode for puid in puids - ] - except (ValueError, TypeError) as e: - logger.error(e) - group_info.return_codes = [-1 for _ in puids] + with self._queue_lock: + for step_id in self._running_steps: + group_info = self._group_infos[step_id] + grp = group_info.process_group + if grp is None: + group_info.status = SmartSimStatus.STATUS_FAILED + group_info.return_codes = [-1] + elif group_info.status not in TERMINAL_STATUSES: + if grp.status == DRG_RUNNING_STATUS: + group_info.status = SmartSimStatus.STATUS_RUNNING else: - group_info.return_codes = [0] - if not group_info.status == SmartSimStatus.STATUS_CANCELLED: - group_info.status = ( - SmartSimStatus.STATUS_FAILED - if any(group_info.return_codes) - or grp.status == DRG_ERROR_STATUS - else SmartSimStatus.STATUS_COMPLETED - ) - - if group_info.status in TERMINAL_STATUSES: - terminated.append(step_id) - - if terminated: - logger.debug(f"{terminated=}") - for step_id in terminated: - self._running_steps.remove(step_id) - self._completed_steps.append(step_id) - group_info = self._group_infos[step_id] - if group_info is not None: - with self._hostlist_lock: + puids = group_info.puids + if puids is not None and all(puid is not None for puid in puids): + try: + group_info.return_codes = [ + Process(None, ident=puid).returncode for puid in puids + ] + except (ValueError, TypeError) as e: + logger.error(e) + group_info.return_codes = [-1 for _ in puids] + else: + group_info.return_codes = [0] + if not group_info.status == 
SmartSimStatus.STATUS_CANCELLED: + group_info.status = ( + SmartSimStatus.STATUS_FAILED + if any(group_info.return_codes) + or grp.status == DRG_ERROR_STATUS + else SmartSimStatus.STATUS_COMPLETED + ) + + if group_info.status in TERMINAL_STATUSES: + terminated.append(step_id) + + if terminated: + logger.debug(f"{terminated=}") + for step_id in terminated: + self._running_steps.remove(step_id) + self._completed_steps.append(step_id) + group_info = self._group_infos[step_id] + if group_info is not None: for host in group_info.hosts: logger.debug(f"Releasing host {host}") self._allocated_hosts.pop(host) @@ -423,7 +426,10 @@ def update(self) -> None: except Exception as e: logger.error(e) if self._should_update(): - self.print_status() + try: + self.print_status() + except Exception as e: + logger.error(e) @process_request.register def _(self, request: DragonUpdateStatusRequest) -> DragonUpdateStatusResponse: @@ -437,7 +443,8 @@ def _(self, request: DragonUpdateStatusRequest) -> DragonUpdateStatusResponse: @process_request.register def _(self, request: DragonStopRequest) -> DragonStopResponse: - self._stop_requests.append(request) + with self._queue_lock: + self._stop_requests.append(request) return DragonStopResponse() @process_request.register diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 3cea7f990..35ca1051a 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -407,9 +407,24 @@ def _dragon_cleanup( try: os.kill(server_process_pid, signal.SIGINT) print("Sent SIGINT to dragon server") - time.sleep(5) + time.sleep(2) if psutil.pid_exists(server_process_pid): + print("Dragon server is still alive, sending SIGINT") + os.kill(server_process_pid, signal.SIGINT) + time.sleep(10) + if psutil.pid_exists(server_process_pid): + print("Dragon server is still alive, sending SIGTERM") os.kill(server_process_pid, signal.SIGTERM) + 
time.sleep(5) + if psutil.pid_exists(server_process_pid): + print("Dragon server is still alive, sending SIGKILL") + os.kill(server_process_pid, signal.SIGKILL) + if psutil.pid_exists(server_process_pid): + print("Waiting for Dragon process to complete") + try: + os.waitpid(server_process_pid, os.WEXITED) + except Exception: + pass except ProcessLookupError: # Can't use the logger as I/O file may be closed print("Dragon server is not running.", flush=True) diff --git a/smartsim/_core/utils/telemetry/util.py b/smartsim/_core/utils/telemetry/util.py index 03f0b495e..cf6459cf9 100644 --- a/smartsim/_core/utils/telemetry/util.py +++ b/smartsim/_core/utils/telemetry/util.py @@ -69,7 +69,7 @@ def write_event( :param return_code: (optional) the return code of a completed task :type return_code: int|None""" - tgt_path = status_dir / f"{event_type}.json" + tgt_path = pathlib.Path(status_dir) / f"{event_type}.json" tgt_path.parent.mkdir(parents=True, exist_ok=True) try: diff --git a/tests/test_controller_errors.py b/tests/test_controller_errors.py index bef155d3a..48ba18cec 100644 --- a/tests/test_controller_errors.py +++ b/tests/test_controller_errors.py @@ -161,13 +161,14 @@ def test_duplicate_running_entity(test_dir, wlmutils, entity): def test_restarting_entity(test_dir, wlmutils, entity): """Validate restarting a completed Model/Ensemble job""" step_settings = RunSettings("echo") - step = MockStep("mock-step", test_dir, step_settings) + test_launcher = wlmutils.get_test_launcher() + if test_launcher == "dragon": + step = DragonStep("mock-step", test_dir, step_settings) + else: + step = MockStep("mock-step", test_dir, step_settings) step.meta["status_dir"] = test_dir entity.path = test_dir - test_launcher = wlmutils.get_test_launcher() controller = Controller(test_launcher) - # if test_launcher == "dragon": - # step = DragonStep("mock-step", test_dir, step_settings) controller._jobs.add_job(entity.name, job_id="1234", entity=entity) 
controller._jobs.move_to_completed(controller._jobs.jobs.get(entity.name)) controller._launch_step(step, entity=entity) @@ -176,13 +177,14 @@ def test_restarting_entity(test_dir, wlmutils, entity): def test_restarting_orch(test_dir, wlmutils): """Validate restarting a completed Orchestrator job""" step_settings = RunSettings("echo") - step = MockStep("mock-step", test_dir, step_settings) + test_launcher = wlmutils.get_test_launcher() + if test_launcher == "dragon": + step = DragonStep("mock-step", test_dir, step_settings) + else: + step = MockStep("mock-step", test_dir, step_settings) step.meta["status_dir"] = test_dir orc.path = test_dir - test_launcher = wlmutils.get_test_launcher() controller = Controller(test_launcher) - # if test_launcher == "dragon": - # step = DragonStep("mock-step", test_dir, step_settings) controller._jobs.add_job(orc.name, job_id="1234", entity=orc) controller._jobs.move_to_completed(controller._jobs.db_jobs.get(orc.name)) controller._launch_step(step, entity=orc) From 1aaf70421be9780d5f82b075c937eb67a7b27cce Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 17 Apr 2024 19:33:58 -0500 Subject: [PATCH 024/101] Refine signal escalation --- .../_core/launcher/dragon/dragonBackend.py | 9 ++++++-- .../_core/launcher/dragon/dragonConnector.py | 23 ++++--------------- 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 0e9f8a7b6..34c0a41b4 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -368,10 +368,13 @@ def _refresh_statuses(self) -> None: group_info.status = SmartSimStatus.STATUS_RUNNING else: puids = group_info.puids - if puids is not None and all(puid is not None for puid in puids): + if puids is not None and all( + puid is not None for puid in puids + ): try: group_info.return_codes = [ - Process(None, ident=puid).returncode for puid in puids + 
Process(None, ident=puid).returncode + for puid in puids ] except (ValueError, TypeError) as e: logger.error(e) @@ -400,6 +403,8 @@ def _refresh_statuses(self) -> None: logger.debug(f"Releasing host {host}") self._allocated_hosts.pop(host) self._free_hosts.append(host) + del group_info.process_group + group_info.process_grop = None def _update_shutdown_status(self) -> None: self._can_shutdown = all( diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 35ca1051a..9c6357298 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -404,27 +404,14 @@ def _dragon_cleanup( print("Authenticator shutdown is complete") if server_process_pid and psutil.pid_exists(server_process_pid): + print("Sending SIGINT to dragon server") try: os.kill(server_process_pid, signal.SIGINT) - print("Sent SIGINT to dragon server") time.sleep(2) - if psutil.pid_exists(server_process_pid): - print("Dragon server is still alive, sending SIGINT") - os.kill(server_process_pid, signal.SIGINT) - time.sleep(10) - if psutil.pid_exists(server_process_pid): - print("Dragon server is still alive, sending SIGTERM") - os.kill(server_process_pid, signal.SIGTERM) - time.sleep(5) - if psutil.pid_exists(server_process_pid): - print("Dragon server is still alive, sending SIGKILL") - os.kill(server_process_pid, signal.SIGKILL) - if psutil.pid_exists(server_process_pid): - print("Waiting for Dragon process to complete") - try: - os.waitpid(server_process_pid, os.WEXITED) - except Exception: - pass + os.kill(server_process_pid, signal.SIGINT) + time.sleep(10) + os.kill(server_process_pid, signal.SIGKILL) + os.waitpid(server_process_pid, os.WEXITED) except ProcessLookupError: # Can't use the logger as I/O file may be closed print("Dragon server is not running.", flush=True) From 21eb3d7bb364d1d4fee46f9150bd1e37fe8e564a Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 17 Apr 2024 
19:35:44 -0500 Subject: [PATCH 025/101] Fix typo --- smartsim/_core/launcher/dragon/dragonBackend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 34c0a41b4..33ef98fd2 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -404,7 +404,7 @@ def _refresh_statuses(self) -> None: self._allocated_hosts.pop(host) self._free_hosts.append(host) del group_info.process_group - group_info.process_grop = None + group_info.process_group = None def _update_shutdown_status(self) -> None: self._can_shutdown = all( From 201bb1108afa8924622fc8a70a1b86e23423c3b7 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 18 Apr 2024 06:04:53 -0500 Subject: [PATCH 026/101] Add non-graceful cleanup --- smartsim/_core/entrypoints/dragon_client.py | 2 +- .../_core/launcher/dragon/dragonConnector.py | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/smartsim/_core/entrypoints/dragon_client.py b/smartsim/_core/entrypoints/dragon_client.py index 7b4edb6c4..35f5e6580 100644 --- a/smartsim/_core/entrypoints/dragon_client.py +++ b/smartsim/_core/entrypoints/dragon_client.py @@ -64,7 +64,7 @@ def main(args: argparse.Namespace) -> int: requests.append(DragonShutdownRequest(immediate=False, frontend_shutdown=False)) - connector = DragonConnector() + connector = DragonConnector(graceful_cleanup=False) for request in requests: response = connector.send_request(request) diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 9c6357298..761e9e2b7 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -72,7 +72,7 @@ class DragonConnector: """ - def __init__(self) -> None: + def __init__(self, graceful_cleanup: bool = True) -> None: super().__init__() 
self._context: zmq.Context[t.Any] = zmq.Context() self._context.setsockopt(zmq.REQ_CORRELATE, 1) @@ -90,6 +90,7 @@ def __init__(self) -> None: # but process was started by another connector self._dragon_head_pid: t.Optional[int] = None self._dragon_server_path = CONFIG.dragon_server_path + self._graceful_cleanup = graceful_cleanup logger.debug(f"Dragon Server path was set to {self._dragon_server_path}") if self._dragon_server_path is None: raise SmartSimError( @@ -291,18 +292,20 @@ def log_dragon_outputs() -> None: server_socket=server_socket, server_process_pid=server_process_pid, server_authenticator=self._authenticator, + graceful=self._graceful_cleanup, ) else: # TODO parse output file log_dragon_outputs() raise SmartSimError("Could not receive address of Dragon head process") - def cleanup(self) -> None: + def cleanup(self, graceful: bool = True) -> None: if self._dragon_head_socket is not None and self._dragon_head_pid is not None: _dragon_cleanup( server_socket=self._dragon_head_socket, server_process_pid=self._dragon_head_pid, server_authenticator=self._authenticator, + graceful=graceful, ) self._dragon_head_socket = None self._dragon_head_pid = 0 @@ -371,6 +374,7 @@ def _dragon_cleanup( server_socket: t.Optional[zmq.Socket[t.Any]] = None, server_process_pid: t.Optional[int] = 0, server_authenticator: t.Optional[zmq.auth.thread.ThreadAuthenticator] = None, + graceful: bool = True, ) -> None: """Clean up resources used by the launcher. 
:param server_socket: (optional) Socket used to connect to dragon environment @@ -406,11 +410,12 @@ def _dragon_cleanup( if server_process_pid and psutil.pid_exists(server_process_pid): print("Sending SIGINT to dragon server") try: - os.kill(server_process_pid, signal.SIGINT) - time.sleep(2) - os.kill(server_process_pid, signal.SIGINT) - time.sleep(10) - os.kill(server_process_pid, signal.SIGKILL) + if graceful: + os.kill(server_process_pid, signal.SIGINT) + time.sleep(2) + os.kill(server_process_pid, signal.SIGINT) + time.sleep(10) + os.kill(server_process_pid, signal.SIGTERM) os.waitpid(server_process_pid, os.WEXITED) except ProcessLookupError: # Can't use the logger as I/O file may be closed From a5d17dbe6d297f19d8054eb58de6f6503d04d8d7 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 18 Apr 2024 07:02:04 -0500 Subject: [PATCH 027/101] Allow LocalStep with Dragon --- smartsim/_core/launcher/dragon/dragonConnector.py | 15 ++++++++------- smartsim/_core/launcher/dragon/dragonLauncher.py | 4 ++-- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 761e9e2b7..abbe296a5 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -77,9 +77,7 @@ def __init__(self, graceful_cleanup: bool = True) -> None: self._context: zmq.Context[t.Any] = zmq.Context() self._context.setsockopt(zmq.REQ_CORRELATE, 1) self._context.setsockopt(zmq.REQ_RELAXED, 1) - self._authenticator: t.Optional[zmq.auth.thread.ThreadAuthenticator] = ( - dragonSockets.get_authenticator(self._context) - ) + self._authenticator: t.Optional[zmq.auth.thread.ThreadAuthenticator] = None self._timeout = CONFIG.dragon_server_timeout self._reconnect_timeout = CONFIG.dragon_server_reconnect_timeout self._startup_timeout = CONFIG.dragon_server_startup_timeout @@ -154,7 +152,7 @@ def connect_to_dragon(self) -> None: # TODO use manager 
instead if self.is_connected: return - + self._authenticator = dragonSockets.get_authenticator(self._context) if self._dragon_server_path is None: raise SmartSimError("Path to Dragon server not set.") @@ -416,12 +414,15 @@ def _dragon_cleanup( os.kill(server_process_pid, signal.SIGINT) time.sleep(10) os.kill(server_process_pid, signal.SIGTERM) - os.waitpid(server_process_pid, os.WEXITED) + _, retcode = os.waitpid(server_process_pid, 0) except ProcessLookupError: # Can't use the logger as I/O file may be closed - print("Dragon server is not running.", flush=True) + print("Dragon server is not running.") finally: - print("Dragon server process shutdown is complete") + print( + f"Dragon server process shutdown is complete , return code {retcode}", + flush=True, + ) def _resolve_dragon_path(fallback: t.Union[str, "os.PathLike[str]"]) -> Path: diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py index 27b26d619..30b6fa428 100644 --- a/smartsim/_core/launcher/dragon/dragonLauncher.py +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -51,7 +51,7 @@ from ..launcher import WLMLauncher from ..pbs.pbsLauncher import PBSLauncher from ..slurm.slurmLauncher import SlurmLauncher -from ..step import DragonBatchStep, DragonStep, Step +from ..step import DragonBatchStep, DragonStep, LocalStep, Step from ..stepInfo import StepInfo from .dragonConnector import DragonConnector, _SchemaT @@ -90,7 +90,7 @@ def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: DragonRunSettings: DragonStep, SbatchSettings: DragonBatchStep, QsubBatchSettings: DragonBatchStep, - RunSettings: DragonStep, + RunSettings: LocalStep, } def add_step_to_mapping_table(self, name: str, step_map: StepMap) -> None: From 2b8505fe14309e94b7dd3630d17d23a7d7a92e73 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 18 Apr 2024 08:38:49 -0500 Subject: [PATCH 028/101] Fix possibly underfined value --- 
smartsim/_core/launcher/dragon/dragonConnector.py | 1 + 1 file changed, 1 insertion(+) diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index abbe296a5..9444e2872 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -407,6 +407,7 @@ def _dragon_cleanup( if server_process_pid and psutil.pid_exists(server_process_pid): print("Sending SIGINT to dragon server") + retcode = None try: if graceful: os.kill(server_process_pid, signal.SIGINT) From b5e811acb54d620db8321b58f345c89ea3cb2894 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 18 Apr 2024 09:51:13 -0500 Subject: [PATCH 029/101] Mitigate test RunSettings issue --- conftest.py | 4 +-- .../_core/launcher/dragon/dragonConnector.py | 30 +++++++++++-------- tests/on_wlm/test_dragon.py | 1 + tests/test_controller_errors.py | 10 ++----- 4 files changed, 22 insertions(+), 23 deletions(-) diff --git a/conftest.py b/conftest.py index 2014441cd..b5d6a39da 100644 --- a/conftest.py +++ b/conftest.py @@ -343,8 +343,9 @@ def get_base_run_settings( return settings if test_launcher == "dragon": run_args = {"nodes": nodes} + run_args = {"ntasks": ntasks} run_args.update(kwargs) - settings = RunSettings(exe, args, run_command="", run_args=run_args) + settings = DragonRunSettings(exe, args, run_args=run_args) return settings if test_launcher == "pbs": if shutil.which("aprun"): @@ -806,7 +807,6 @@ def global_dragon_teardown() -> None: dragon_connector = DragonConnector() dragon_connector.ensure_connected() dragon_connector.cleanup() - time.sleep(5) @pytest.fixture diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 9444e2872..4ad0493c3 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -133,12 +133,15 @@ def _set_timeout(self, timeout: int) -> None: 
self._context.setsockopt(zmq.SNDTIMEO, value=timeout) self._context.setsockopt(zmq.RCVTIMEO, value=timeout) if self._authenticator is not None and self._authenticator.thread is not None: - self._authenticator.thread.authenticator.zap_socket.setsockopt( - zmq.SNDTIMEO, timeout - ) - self._authenticator.thread.authenticator.zap_socket.setsockopt( - zmq.RCVTIMEO, timeout - ) + try: + self._authenticator.thread.authenticator.zap_socket.setsockopt( + zmq.SNDTIMEO, timeout + ) + self._authenticator.thread.authenticator.zap_socket.setsockopt( + zmq.RCVTIMEO, timeout + ) + except zmq.ZMQError: + logger.debug("ZAP socket is not set") def ensure_connected(self) -> None: if not self.is_connected: @@ -152,7 +155,6 @@ def connect_to_dragon(self) -> None: # TODO use manager instead if self.is_connected: return - self._authenticator = dragonSockets.get_authenticator(self._context) if self._dragon_server_path is None: raise SmartSimError("Path to Dragon server not set.") @@ -160,6 +162,7 @@ def connect_to_dragon(self) -> None: dragon_config_log = path / CONFIG.dragon_log_filename if dragon_config_log.is_file(): + dragon_confs = self._parse_launched_dragon_server_info_from_files( [dragon_config_log] ) @@ -173,20 +176,20 @@ def connect_to_dragon(self) -> None: ) try: self._set_timeout(self._reconnect_timeout) + self._authenticator = dragonSockets.get_authenticator(self._context) self._handshake(dragon_conf["address"]) except SmartSimError as e: logger.warning(e) + finally: logger.debug("Closing ZAP socket") if ( self._authenticator is not None and self._authenticator.thread is not None ): - self._authenticator.thread.authenticator.zap_socket.close() - logger.debug("Getting new auth") - self._authenticator = dragonSockets.get_authenticator( - self._context - ) - finally: + try: + self._authenticator.thread.authenticator.zap_socket.close() + except Exception: + logger.debug("Could not close ZAP socket") self._set_timeout(self._timeout) if self.is_connected: return @@ -210,6 +213,7 @@ 
def connect_to_dragon(self) -> None: if address is not None: self._set_timeout(self._startup_timeout) + self._authenticator = dragonSockets.get_authenticator(self._context) connector_socket = dragonSockets.get_secure_socket( self._context, zmq.REP, True ) diff --git a/tests/on_wlm/test_dragon.py b/tests/on_wlm/test_dragon.py index d5f94c639..7809d6647 100644 --- a/tests/on_wlm/test_dragon.py +++ b/tests/on_wlm/test_dragon.py @@ -54,6 +54,7 @@ def test_dragon_global_path(global_dragon_teardown, wlmutils, test_dir, monkeypa def test_dragon_exp_path(global_dragon_teardown, wlmutils, test_dir, monkeypatch): monkeypatch.delenv("SMARTSIM_DRAGON_SERVER_PATH", raising=False) + monkeypatch.delenv("SMARTSIM_DRAGON_SERVER_PATH_EXP", raising=False) exp: Experiment = Experiment( "test_dragon_connection", exp_path=test_dir, diff --git a/tests/test_controller_errors.py b/tests/test_controller_errors.py index 48ba18cec..8ddf02db5 100644 --- a/tests/test_controller_errors.py +++ b/tests/test_controller_errors.py @@ -162,10 +162,7 @@ def test_restarting_entity(test_dir, wlmutils, entity): """Validate restarting a completed Model/Ensemble job""" step_settings = RunSettings("echo") test_launcher = wlmutils.get_test_launcher() - if test_launcher == "dragon": - step = DragonStep("mock-step", test_dir, step_settings) - else: - step = MockStep("mock-step", test_dir, step_settings) + step = MockStep("mock-step", test_dir, step_settings) step.meta["status_dir"] = test_dir entity.path = test_dir controller = Controller(test_launcher) @@ -178,10 +175,7 @@ def test_restarting_orch(test_dir, wlmutils): """Validate restarting a completed Orchestrator job""" step_settings = RunSettings("echo") test_launcher = wlmutils.get_test_launcher() - if test_launcher == "dragon": - step = DragonStep("mock-step", test_dir, step_settings) - else: - step = MockStep("mock-step", test_dir, step_settings) + step = MockStep("mock-step", test_dir, step_settings) step.meta["status_dir"] = test_dir orc.path = 
test_dir controller = Controller(test_launcher) From dfaa278dc069238bf7183fe34bf40df386eb0317 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 18 Apr 2024 17:05:12 -0500 Subject: [PATCH 030/101] Add thin layer of resilience to udpater thread --- smartsim/_core/config/config.py | 4 -- smartsim/_core/entrypoints/dragon.py | 23 ++++++-- .../_core/launcher/dragon/dragonBackend.py | 58 +++++++++++++------ .../_core/launcher/dragon/dragonConnector.py | 50 ++++++++-------- .../_core/launcher/dragon/dragonSockets.py | 22 +++++-- tests/test_dragon_launcher.py | 15 ++--- 6 files changed, 111 insertions(+), 61 deletions(-) diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 6e892ab27..c26e77274 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -160,10 +160,6 @@ def dragon_server_path(self) -> t.Optional[str]: os.getenv("SMARTSIM_DRAGON_SERVER_PATH_EXP", None), ) - @property - def dragon_server_reconnect_timeout(self) -> int: - return int(os.getenv("SMARTSIM_DRAGON_RECONNECT_TIMEOUT", "20000")) - @property def dragon_server_timeout(self) -> int: return int(os.getenv("SMARTSIM_DRAGON_TIMEOUT", "20000")) diff --git a/smartsim/_core/entrypoints/dragon.py b/smartsim/_core/entrypoints/dragon.py index 6d820947b..77a901ecf 100644 --- a/smartsim/_core/entrypoints/dragon.py +++ b/smartsim/_core/entrypoints/dragon.py @@ -81,6 +81,7 @@ def print_summary(network_interface: str, ip_address: str) -> None: ) +# pylint: disable-next=too-many-statements def run( zmq_context: "zmq.Context[t.Any]", dragon_head_address: str, @@ -109,13 +110,16 @@ def run( logger.debug(f"Listening to {dragon_head_address}") + updater_last_beat = dragon_backend.last_heartbeat + grace_period = 2 + no_update_steps = 0 + while not (dragon_backend.should_shutdown or SHUTDOWN_INITIATED): try: req = server.recv() logger.debug(f"Received {type(req).__name__} {req}") except zmq.Again: if not (dragon_backend.should_shutdown or SHUTDOWN_INITIATED): - # 
logger.debug(f"Listening to {dragon_head_address}") continue logger.info("Shutdown has been requested") break @@ -130,8 +134,19 @@ def run( if not (dragon_backend.should_shutdown or SHUTDOWN_INITIATED): logger.debug(f"Listening to {dragon_head_address}") - if not backend_updater.is_alive(): - backend_updater.start() + if updater_last_beat <= dragon_backend.last_heartbeat: + no_update_steps += 1 + if no_update_steps >= grace_period: + logger.debug("Restarting updater") + del backend_updater + backend_updater = ContextThread( + name="DragonBackend", daemon=True, target=dragon_backend.update + ) + backend_updater.start() + no_update_steps = 0 + else: + updater_last_beat = dragon_backend.last_heartbeat + no_update_steps = 0 else: logger.info("Backend shutdown has been requested") break @@ -144,7 +159,7 @@ def run( if not dragon_backend.frontend_shutdown: logger.info("Frontend will have to be shut down externally") while True: - time.sleep(1) + time.sleep(5) logger.info("Waiting for external shutdown") diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 33ef98fd2..0fa95b4ca 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -124,6 +124,8 @@ def __init__(self, pid: int) -> None: self._running_steps: t.List[str] = [] self._completed_steps: t.List[str] = [] self._last_update_time: float = time.time_ns() / 1e9 + self._last_beat: float = 0.0 + self._heartbeat() num_hosts = len(self._hosts) host_string = str(num_hosts) + (" hosts" if num_hosts > 1 else " host") self._shutdown_requested = False @@ -139,10 +141,17 @@ def print_status(self) -> None: logger.debug(f"Group infos: {self._group_infos}") logger.debug(f"There are {len(self._queued_steps)} queued steps") + def _heartbeat(self): + self._last_beat = time.time_ns() / 1e9 + @property def frontend_shutdown(self) -> bool: return bool(self._frontend_shutdown) + @property + def last_heartbeat(self) 
-> float: + return self._last_beat + @property def should_shutdown(self) -> bool: return self._shutdown_requested and self._can_shutdown @@ -244,8 +253,14 @@ def _start_redirect_workers( policy=pol, ), ) - grp_redir.init() - grp_redir.start() + try: + grp_redir.init() + time.sleep(0.1) + grp_redir.start() + except Exception as e: + raise IOError( + f"Could not redirect stdout and stderr for PUIDS {puids}" + ) from e def _stop_steps(self) -> None: with self._queue_lock: @@ -343,7 +358,7 @@ def _start_steps(self) -> None: request.error_file, ) except Exception as e: - raise IOError("Could not redirect output") from e + logger.error(e) if started: logger.debug(f"{started=}") @@ -351,8 +366,10 @@ def _start_steps(self) -> None: for step_id in started: try: self._queued_steps.pop(step_id) - except KeyError as e: - logger.error(e) + except KeyError: + logger.error( + "Tried to allocate the same step twice, step id {step_id}" + ) def _refresh_statuses(self) -> None: terminated = [] @@ -401,9 +418,15 @@ def _refresh_statuses(self) -> None: if group_info is not None: for host in group_info.hosts: logger.debug(f"Releasing host {host}") - self._allocated_hosts.pop(host) + try: + self._allocated_hosts.pop(host) + except KeyError: + logger.error(f"Tried to free same host twice :{host}") self._free_hosts.append(host) - del group_info.process_group + try: + del group_info.process_group + except Exception: + logger.error("Could not delete Process Group") group_info.process_group = None def _update_shutdown_status(self) -> None: @@ -413,9 +436,8 @@ def _update_shutdown_status(self) -> None: ) def _should_update(self) -> bool: - current_time = time.time_ns() / 1e9 - if current_time - self._last_update_time > 10: - self._last_update_time = current_time + if self._last_beat - self._last_update_time > 10: + self._last_update_time = self._last_beat return True return False @@ -423,6 +445,7 @@ def update(self) -> None: logger.debug("Dragon Backend update thread started") while True: try: 
+ self._heartbeat() self._stop_steps() self._start_steps() self._refresh_statuses() @@ -438,13 +461,14 @@ def update(self) -> None: @process_request.register def _(self, request: DragonUpdateStatusRequest) -> DragonUpdateStatusResponse: - return DragonUpdateStatusResponse( - statuses={ - step_id: self._group_infos[step_id].smartsim_info - for step_id in request.step_ids - if step_id in self._group_infos - } - ) + with self._queue_lock: + return DragonUpdateStatusResponse( + statuses={ + step_id: self._group_infos[step_id].smartsim_info + for step_id in request.step_ids + if step_id in self._group_infos + } + ) @process_request.register def _(self, request: DragonStopRequest) -> DragonStopResponse: diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 4ad0493c3..ae2761fc9 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -69,17 +69,15 @@ class DragonConnector: """This class encapsulates the functionality needed to start a Dragon server and communicate with it. 
- """ def __init__(self, graceful_cleanup: bool = True) -> None: super().__init__() - self._context: zmq.Context[t.Any] = zmq.Context() + self._context: zmq.Context[t.Any] = zmq.Context.instance() self._context.setsockopt(zmq.REQ_CORRELATE, 1) self._context.setsockopt(zmq.REQ_RELAXED, 1) self._authenticator: t.Optional[zmq.auth.thread.ThreadAuthenticator] = None self._timeout = CONFIG.dragon_server_timeout - self._reconnect_timeout = CONFIG.dragon_server_reconnect_timeout self._startup_timeout = CONFIG.dragon_server_startup_timeout self._set_timeout(self._timeout) self._dragon_head_socket: t.Optional[zmq.Socket[t.Any]] = None @@ -118,13 +116,9 @@ def _handshake(self, address: str) -> None: ) except (zmq.ZMQError, zmq.Again) as e: logger.debug(e) - if self._authenticator is not None: - try: - self._authenticator.stop() - except zmq.Again: - logger.error("Could not stop authenticator") self._dragon_head_socket.close() self._dragon_head_socket = None + raise SmartSimError( f"Unsuccessful handshake with Dragon server at address {address}" ) from e @@ -141,7 +135,7 @@ def _set_timeout(self, timeout: int) -> None: zmq.RCVTIMEO, timeout ) except zmq.ZMQError: - logger.debug("ZAP socket is not set") + pass def ensure_connected(self) -> None: if not self.is_connected: @@ -149,6 +143,25 @@ def ensure_connected(self) -> None: if not self.is_connected: raise SmartSimError("Could not connect to Dragon server") + def _get_new_authenticator(self): + if self._authenticator is not None: + if self._authenticator.thread is not None: + try: + logger.debug("Closing ZAP socket") + self._authenticator.thread.authenticator.zap_socket.close() + except Exception as e: + logger.debug(f"Could not close ZAP socket, {e}") + try: + self._authenticator.stop() + except zmq.Again: + logger.debug("Could not stop authenticator") + try: + self._authenticator = dragonSockets.get_authenticator(self._context) + return + except RuntimeError as e: + logger.error("Could not get authenticator") + raise e 
from None + # pylint: disable-next=too-many-statements,too-many-locals def connect_to_dragon(self) -> None: with DRG_LOCK: @@ -175,23 +188,15 @@ def connect_to_dragon(self) -> None: f" is still up at address {dragon_conf['address']}." ) try: - self._set_timeout(self._reconnect_timeout) - self._authenticator = dragonSockets.get_authenticator(self._context) + self._set_timeout(self._timeout) + self._get_new_authenticator() self._handshake(dragon_conf["address"]) except SmartSimError as e: - logger.warning(e) + logger.debug(e) finally: - logger.debug("Closing ZAP socket") - if ( - self._authenticator is not None - and self._authenticator.thread is not None - ): - try: - self._authenticator.thread.authenticator.zap_socket.close() - except Exception: - logger.debug("Could not close ZAP socket") self._set_timeout(self._timeout) if self.is_connected: + logger.debug("Connected to existing Dragon server") return path.mkdir(parents=True, exist_ok=True) @@ -212,8 +217,7 @@ def connect_to_dragon(self) -> None: connector_socket: t.Optional[zmq.Socket[t.Any]] = None if address is not None: self._set_timeout(self._startup_timeout) - - self._authenticator = dragonSockets.get_authenticator(self._context) + self._get_new_authenticator() connector_socket = dragonSockets.get_secure_socket( self._context, zmq.REP, True ) diff --git a/smartsim/_core/launcher/dragon/dragonSockets.py b/smartsim/_core/launcher/dragon/dragonSockets.py index 1de9d8819..188ea2a2f 100644 --- a/smartsim/_core/launcher/dragon/dragonSockets.py +++ b/smartsim/_core/launcher/dragon/dragonSockets.py @@ -26,6 +26,7 @@ import typing as t +import zmq import zmq.auth.thread from smartsim._core.config.config import get_config @@ -39,9 +40,10 @@ from zmq import Context from zmq.sugar.socket import Socket - logger = get_logger(__name__) +AUTHENTICATOR: t.Optional["zmq.auth.thread.ThreadAuthenticator"] = None + def as_server( socket: "Socket[t.Any]", @@ -115,20 +117,28 @@ def get_authenticator( :type context: zmq.Context 
:returns: the activated `Authenticator` :rtype: zmq.auth.thread.ThreadAuthenticator""" + # pylint: disable-next=global-statement + global AUTHENTICATOR + + if AUTHENTICATOR is not None: + if AUTHENTICATOR.is_alive(): + return AUTHENTICATOR + del AUTHENTICATOR + config = get_config() key_manager = KeyManager(config, as_client=True) server_keys, client_keys = key_manager.get_keys() logger.debug(f"Applying keys to authenticator: {server_keys}, {client_keys}") - authenticator = zmq.auth.thread.ThreadAuthenticator(context) + AUTHENTICATOR = zmq.auth.thread.ThreadAuthenticator(context) # allow all keys in the client key directory to connect logger.debug(f"Securing with client keys in {key_manager.client_keys_dir}") - authenticator.configure_curve(domain="*", location=key_manager.client_keys_dir) + AUTHENTICATOR.configure_curve(domain="*", location=key_manager.client_keys_dir) - if not authenticator.is_alive(): + if not AUTHENTICATOR.is_alive(): logger.debug("Starting authenticator") - authenticator.start() + AUTHENTICATOR.start() - return authenticator + return AUTHENTICATOR diff --git a/tests/test_dragon_launcher.py b/tests/test_dragon_launcher.py index 5b28307a9..81a86da9e 100644 --- a/tests/test_dragon_launcher.py +++ b/tests/test_dragon_launcher.py @@ -34,10 +34,7 @@ import zmq from smartsim._core.config.config import get_config -from smartsim._core.launcher.dragon.dragonLauncher import ( - DragonConnector, - DragonLauncher, -) +from smartsim._core.launcher.dragon.dragonLauncher import DragonConnector from smartsim._core.launcher.dragon.dragonSockets import ( get_authenticator, get_secure_socket, @@ -209,12 +206,14 @@ def test_dragon_connect_bind_address(monkeypatch: pytest.MonkeyPatch, test_dir: ctx.setattr("zmq.Context.socket", mock_socket) ctx.setattr("subprocess.Popen", lambda *args, **kwargs: MockPopen()) - dragon_launcher = DragonLauncher() - dragon_launcher._connector.connect_to_dragon() + dragon_connector = DragonConnector() + 
dragon_connector.connect_to_dragon() chosen_port = int(mock_socket.bind_address.split(":")[-1]) assert chosen_port >= 5995 + dragon_connector._authenticator.stop() + @pytest.mark.parametrize( "socket_type, is_server", @@ -251,6 +250,8 @@ def test_secure_socket_authenticator_setup( # ensure authenticator is using the expected set of keys assert authenticator.cfg_kwargs.get("location", "") == km.client_keys_dir + authenticator.stop() + @pytest.mark.parametrize( "as_server", @@ -357,5 +358,5 @@ def fn(*args, **kwargs): # connect executes the complete handshake and raises an exception if comms fails connector.connect_to_dragon() finally: - connector.cleanup() + connector.cleanup(False) ... From 91352478f45817e08076019cc1201a2bcfd9b4d2 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 18 Apr 2024 17:06:33 -0500 Subject: [PATCH 031/101] Type checking --- smartsim/_core/launcher/dragon/dragonBackend.py | 4 ++-- smartsim/_core/launcher/dragon/dragonConnector.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 0fa95b4ca..fa9dc8a7b 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -123,9 +123,9 @@ def __init__(self, pid: int) -> None: self._stop_requests: t.Deque[DragonStopRequest] = collections.deque() self._running_steps: t.List[str] = [] self._completed_steps: t.List[str] = [] - self._last_update_time: float = time.time_ns() / 1e9 self._last_beat: float = 0.0 self._heartbeat() + self._last_update_time = self._last_beat num_hosts = len(self._hosts) host_string = str(num_hosts) + (" hosts" if num_hosts > 1 else " host") self._shutdown_requested = False @@ -141,7 +141,7 @@ def print_status(self) -> None: logger.debug(f"Group infos: {self._group_infos}") logger.debug(f"There are {len(self._queued_steps)} queued steps") - def _heartbeat(self): + def _heartbeat(self) -> None: 
self._last_beat = time.time_ns() / 1e9 @property diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index ae2761fc9..cc6c7cd00 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -143,7 +143,7 @@ def ensure_connected(self) -> None: if not self.is_connected: raise SmartSimError("Could not connect to Dragon server") - def _get_new_authenticator(self): + def _get_new_authenticator(self) -> None: if self._authenticator is not None: if self._authenticator.thread is not None: try: From b67e663b26ed984c0658598b81e0ba102e6e1bc9 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 19 Apr 2024 13:25:31 -0500 Subject: [PATCH 032/101] Adjustments to timing, signals, heartbeats --- smartsim/_core/entrypoints/dragon.py | 33 +++++++---------- smartsim/_core/entrypoints/dragon_client.py | 10 +++--- .../_core/launcher/dragon/dragonBackend.py | 35 +++++++++++++------ .../_core/launcher/dragon/dragonConnector.py | 2 +- 4 files changed, 44 insertions(+), 36 deletions(-) diff --git a/smartsim/_core/entrypoints/dragon.py b/smartsim/_core/entrypoints/dragon.py index 77a901ecf..6eb4423cb 100644 --- a/smartsim/_core/entrypoints/dragon.py +++ b/smartsim/_core/entrypoints/dragon.py @@ -110,10 +110,6 @@ def run( logger.debug(f"Listening to {dragon_head_address}") - updater_last_beat = dragon_backend.last_heartbeat - grace_period = 2 - no_update_steps = 0 - while not (dragon_backend.should_shutdown or SHUTDOWN_INITIATED): try: req = server.recv() @@ -134,33 +130,28 @@ def run( if not (dragon_backend.should_shutdown or SHUTDOWN_INITIATED): logger.debug(f"Listening to {dragon_head_address}") - if updater_last_beat <= dragon_backend.last_heartbeat: - no_update_steps += 1 - if no_update_steps >= grace_period: - logger.debug("Restarting updater") - del backend_updater - backend_updater = ContextThread( - name="DragonBackend", daemon=True, target=dragon_backend.update 
- ) - backend_updater.start() - no_update_steps = 0 - else: - updater_last_beat = dragon_backend.last_heartbeat - no_update_steps = 0 + heartbeat_delay = dragon_backend.current_time - dragon_backend.last_heartbeat + if heartbeat_delay > 10.0: + logger.debug( + f"Restarting updater after {heartbeat_delay:.2f} seconds of inactivity." + ) + del backend_updater + backend_updater = ContextThread( + name="DragonBackend", daemon=True, target=dragon_backend.update + ) + backend_updater.start() else: logger.info("Backend shutdown has been requested") break - try: + if backend_updater.is_alive(): del backend_updater - except Exception: - logger.debug("Could not delete backend updater thread") if not dragon_backend.frontend_shutdown: logger.info("Frontend will have to be shut down externally") while True: - time.sleep(5) logger.info("Waiting for external shutdown") + time.sleep(5) def main(args: argparse.Namespace) -> int: diff --git a/smartsim/_core/entrypoints/dragon_client.py b/smartsim/_core/entrypoints/dragon_client.py index 35f5e6580..b34c53f69 100644 --- a/smartsim/_core/entrypoints/dragon_client.py +++ b/smartsim/_core/entrypoints/dragon_client.py @@ -62,7 +62,7 @@ def main(args: argparse.Namespace) -> int: for req_str in req_strings: requests.append(request_registry.from_string(req_str)) - requests.append(DragonShutdownRequest(immediate=False, frontend_shutdown=False)) + requests.append(DragonShutdownRequest(immediate=False, frontend_shutdown=True)) connector = DragonConnector(graceful_cleanup=False) @@ -83,19 +83,21 @@ def main(args: argparse.Namespace) -> int: while True: # pylint: disable-next=protected-access try: - time.sleep(1) + time.sleep(5) connector.send_request(DragonHandshakeRequest()) except zmq.error.Again: - print("Could not reach server, assuming backend has shut down") + print("Could not reach server, assuming backend has shut down", flush=True) + # os.waitpid(connector._dragon_head_pid, 0) break print("Server has finished.") + return 0 if __name__ 
== "__main__": os.environ["PYTHONUNBUFFERED"] = "1" - logger.info("Dragon server started") + logger.info("Dragon client started") parser = argparse.ArgumentParser( prefix_chars="+", diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index fa9dc8a7b..d728e574f 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -74,6 +74,7 @@ class ProcessGroupInfo: puids: t.Optional[t.List[t.Optional[int]]] = None # puids can be None return_codes: t.Optional[t.List[int]] = None hosts: t.List[str] = field(default_factory=list) + redir_workers: t.Optional[ProcessGroup] = None @property def smartsim_info(self) -> t.Tuple[SmartSimStatus, t.Optional[t.List[int]]]: @@ -142,7 +143,7 @@ def print_status(self) -> None: logger.debug(f"There are {len(self._queued_steps)} queued steps") def _heartbeat(self) -> None: - self._last_beat = time.time_ns() / 1e9 + self._last_beat = self.current_time @property def frontend_shutdown(self) -> bool: @@ -156,6 +157,10 @@ def last_heartbeat(self) -> float: def should_shutdown(self) -> bool: return self._shutdown_requested and self._can_shutdown + @property + def current_time(self) -> float: + return time.time_ns() / 1e9 + def _initialize_hosts(self) -> None: with self._queue_lock: self._hosts: t.List[str] = sorted( @@ -229,7 +234,7 @@ def _start_redirect_workers( puids: t.List[int], out_file: t.Optional[str], err_file: t.Optional[str], - ) -> None: + ) -> ProcessGroup: grp_redir = ProcessGroup(restart=False, policy=global_policy) for pol, puid in zip(policies, puids): proc = Process(None, ident=puid) @@ -262,6 +267,8 @@ def _start_redirect_workers( f"Could not redirect stdout and stderr for PUIDS {puids}" ) from e + return grp_redir + def _stop_steps(self) -> None: with self._queue_lock: while len(self._stop_requests) > 0: @@ -280,7 +287,7 @@ def _stop_steps(self) -> None: proc_group = self._group_infos[step_id].process_group if ( 
proc_group is not None - and proc_group.status not in TERMINAL_STATUSES + and proc_group.status == DRG_RUNNING_STATUS ): try: proc_group.kill() @@ -289,6 +296,13 @@ def _stop_steps(self) -> None: proc_group.stop() except DragonProcessGroupError: logger.error("Process group already stopped") + redir_group = self._group_infos[step_id].redir_workers + if redir_group is not None: + try: + redir_group.join(0.1) + del redir_group + except Exception as e: + logger.error(e) self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED self._group_infos[step_id].return_codes = [-9] @@ -350,13 +364,14 @@ def _start_steps(self) -> None: if puids is not None: try: - DragonBackend._start_redirect_workers( + redir_grp = DragonBackend._start_redirect_workers( global_policy, policies, puids, request.output_file, request.error_file, ) + self._group_infos[step_id].redir_workers = redir_grp except Exception as e: logger.error(e) @@ -421,17 +436,16 @@ def _refresh_statuses(self) -> None: try: self._allocated_hosts.pop(host) except KeyError: - logger.error(f"Tried to free same host twice :{host}") + logger.error(f"Tried to free same host twice: {host}") self._free_hosts.append(host) - try: - del group_info.process_group - except Exception: - logger.error("Could not delete Process Group") group_info.process_group = None + group_info.redir_workers = None def _update_shutdown_status(self) -> None: self._can_shutdown = all( grp_info.status in TERMINAL_STATUSES + and grp_info.process_group is None + and grp_info.redir_workers is None for grp_info in self._group_infos.values() ) @@ -443,7 +457,7 @@ def _should_update(self) -> bool: def update(self) -> None: logger.debug("Dragon Backend update thread started") - while True: + while not self.should_shutdown: try: self._heartbeat() self._stop_steps() @@ -458,6 +472,7 @@ def update(self) -> None: self.print_status() except Exception as e: logger.error(e) + logger.debug("Dragon Backend update thread stopping") @process_request.register def 
_(self, request: DragonUpdateStatusRequest) -> DragonUpdateStatusResponse: diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index cc6c7cd00..5719caddb 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -414,8 +414,8 @@ def _dragon_cleanup( print("Authenticator shutdown is complete") if server_process_pid and psutil.pid_exists(server_process_pid): - print("Sending SIGINT to dragon server") retcode = None + print("Terminating Dragon server") try: if graceful: os.kill(server_process_pid, signal.SIGINT) From 893f5dbd66f98144d4a9af4dac48c70089ec1e1d Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 23 Apr 2024 16:59:38 -0500 Subject: [PATCH 033/101] First pass after reviews --- smartsim/_core/config/config.py | 4 - smartsim/_core/entrypoints/dragon.py | 45 ++-- smartsim/_core/entrypoints/dragon_client.py | 23 +- smartsim/_core/entrypoints/redis.py | 9 +- .../_core/launcher/dragon/dragonConnector.py | 209 +++++++++--------- .../_core/launcher/dragon/dragonSockets.py | 15 +- smartsim/_core/launcher/launcher.py | 14 +- smartsim/_core/launcher/local/local.py | 5 +- .../_core/launcher/slurm/slurmLauncher.py | 1 - smartsim/_core/launcher/step/dragonStep.py | 13 +- tests/test_dragon_launcher.py | 19 +- 11 files changed, 189 insertions(+), 168 deletions(-) diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index c26e77274..c15b26d13 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -172,10 +172,6 @@ def dragon_server_startup_timeout(self) -> int: def dragon_transport(self) -> str: return os.getenv("SMARTSIM_DRAGON_TRANSPORT", "hsta") - @property - def dragon_log_level(self) -> str: - return os.getenv("SMARTSIM_DRAGON_LOG_LEVEL", "NONE") - @property def log_level(self) -> str: return os.environ.get("SMARTSIM_LOG_LEVEL", "info") diff --git a/smartsim/_core/entrypoints/dragon.py 
b/smartsim/_core/entrypoints/dragon.py index 6eb4423cb..58637d818 100644 --- a/smartsim/_core/entrypoints/dragon.py +++ b/smartsim/_core/entrypoints/dragon.py @@ -81,31 +81,29 @@ def print_summary(network_interface: str, ip_address: str) -> None: ) +def start_updater( + backend: DragonBackend, updater: t.Optional[ContextThread] +) -> ContextThread: + if updater is not None: + updater.join(0.1) + updater = ContextThread(name="DragonBackend", daemon=True, target=backend.update) + updater.start() + return updater + + # pylint: disable-next=too-many-statements def run( zmq_context: "zmq.Context[t.Any]", dragon_head_address: str, dragon_pid: int, - zmq_authenticator: "zmq.auth.thread.ThreadAuthenticator", ) -> None: logger.debug(f"Opening socket {dragon_head_address}") - zmq_context.setsockopt(zmq.SNDTIMEO, value=-1) - zmq_context.setsockopt(zmq.RCVTIMEO, value=-1) - zmq_context.setsockopt(zmq.REQ_CORRELATE, 1) - zmq_context.setsockopt(zmq.REQ_RELAXED, 1) - zmq_authenticator.thread.authenticator.zap_socket.setsockopt(zmq.SNDTIMEO, -1) - zmq_authenticator.thread.authenticator.zap_socket.setsockopt(zmq.RCVTIMEO, -1) - dragon_head_socket = dragonSockets.get_secure_socket(zmq_context, zmq.REP, True) dragon_head_socket.bind(dragon_head_address) dragon_backend = DragonBackend(pid=dragon_pid) - backend_updater = ContextThread( - name="DragonBackend", daemon=True, target=dragon_backend.update - ) - backend_updater.start() - + backend_updater = start_updater(dragon_backend, None) server = dragonSockets.as_server(dragon_head_socket) logger.debug(f"Listening to {dragon_head_address}") @@ -115,10 +113,7 @@ def run( req = server.recv() logger.debug(f"Received {type(req).__name__} {req}") except zmq.Again: - if not (dragon_backend.should_shutdown or SHUTDOWN_INITIATED): - continue - logger.info("Shutdown has been requested") - break + continue resp = dragon_backend.process_request(req) @@ -130,22 +125,21 @@ def run( if not (dragon_backend.should_shutdown or SHUTDOWN_INITIATED): 
logger.debug(f"Listening to {dragon_head_address}") - heartbeat_delay = dragon_backend.current_time - dragon_backend.last_heartbeat + heartbeat_delay = ( + dragon_backend.current_time - dragon_backend.last_heartbeat + ) if heartbeat_delay > 10.0: logger.debug( - f"Restarting updater after {heartbeat_delay:.2f} seconds of inactivity." - ) - del backend_updater - backend_updater = ContextThread( - name="DragonBackend", daemon=True, target=dragon_backend.update + f"Restarting updater after {heartbeat_delay:.2f} " + "seconds of inactivity." ) - backend_updater.start() + backend_updater = start_updater(dragon_backend, backend_updater) else: logger.info("Backend shutdown has been requested") break if backend_updater.is_alive(): - del backend_updater + backend_updater.join(1) if not dragon_backend.frontend_shutdown: logger.info("Frontend will have to be shut down externally") @@ -206,7 +200,6 @@ def main(args: argparse.Namespace) -> int: zmq_context=zmq_context, dragon_head_address=dragon_head_address, dragon_pid=response.dragon_pid, - zmq_authenticator=zmq_authenticator, ) except Exception as e: logger.error(f"Dragon server failed with {e}", exc_info=True) diff --git a/smartsim/_core/entrypoints/dragon_client.py b/smartsim/_core/entrypoints/dragon_client.py index b34c53f69..c026e7e4f 100644 --- a/smartsim/_core/entrypoints/dragon_client.py +++ b/smartsim/_core/entrypoints/dragon_client.py @@ -56,8 +56,18 @@ def main(args: argparse.Namespace) -> int: requests: t.List[DragonRequest] = [] - with open(args.submit, "r", encoding="utf-8") as request_file: - req_strings = json.load(fp=request_file) + try: + with open(args.submit, "r", encoding="utf-8") as request_file: + req_strings = json.load(fp=request_file) + except FileNotFoundError: + logger.error( + "Could not find file with run requests," + f"please check whether {args.submit} exists." 
+ ) + return 1 + except json.JSONDecodeError: + logger.error(f"Could not decode request file {args.submit}.") + return 1 for req_str in req_strings: requests.append(request_registry.from_string(req_str)) @@ -73,24 +83,21 @@ def main(args: argparse.Namespace) -> int: logger.info("Terminated sending requests, waiting for Dragon Server to complete") - # pylint: disable-next=protected-access - if connector._dragon_head_pid is None: + if not connector.can_monitor: logger.error( "Could not get Dragon Server PID and will not be able to monitor it." ) return 1 while True: - # pylint: disable-next=protected-access try: time.sleep(5) connector.send_request(DragonHandshakeRequest()) except zmq.error.Again: - print("Could not reach server, assuming backend has shut down", flush=True) - # os.waitpid(connector._dragon_head_pid, 0) + logger.debug("Could not reach server, assuming backend has shut down") break - print("Server has finished.") + logger.info("Server has finished.") return 0 diff --git a/smartsim/_core/entrypoints/redis.py b/smartsim/_core/entrypoints/redis.py index b856fdb43..6904d434a 100644 --- a/smartsim/_core/entrypoints/redis.py +++ b/smartsim/_core/entrypoints/redis.py @@ -180,14 +180,7 @@ def cleanup() -> None: action="store_true", help="Specify if this orchestrator shard is part of a cluster", ) - # parser.add_argument( - # "+redirect_output", - # action="store_true", - # help=( - # "Specify if stdout and stderr of this script should be redirected. " - # + "Only needed for dragon launcher." 
- # ), - # ) + args_ = parser.parse_args() # make sure to register the cleanup before the start diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 5719caddb..554afbf87 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -72,14 +72,12 @@ class DragonConnector: """ def __init__(self, graceful_cleanup: bool = True) -> None: - super().__init__() self._context: zmq.Context[t.Any] = zmq.Context.instance() self._context.setsockopt(zmq.REQ_CORRELATE, 1) self._context.setsockopt(zmq.REQ_RELAXED, 1) self._authenticator: t.Optional[zmq.auth.thread.ThreadAuthenticator] = None - self._timeout = CONFIG.dragon_server_timeout self._startup_timeout = CONFIG.dragon_server_startup_timeout - self._set_timeout(self._timeout) + self._set_timeout(CONFIG.dragon_server_timeout) self._dragon_head_socket: t.Optional[zmq.Socket[t.Any]] = None self._dragon_head_process: t.Optional[subprocess.Popen[bytes]] = None # Returned by dragon head, useful if shutdown is to be requested @@ -101,6 +99,10 @@ def __init__(self, graceful_cleanup: bool = True) -> None: def is_connected(self) -> bool: return self._dragon_head_socket is not None + @property + def can_monitor(self) -> bool: + return self._dragon_head_pid is not None + def _handshake(self, address: str) -> None: self._dragon_head_socket = dragonSockets.get_secure_socket( self._context, zmq.REQ, False @@ -162,7 +164,56 @@ def _get_new_authenticator(self) -> None: logger.error("Could not get authenticator") raise e from None - # pylint: disable-next=too-many-statements,too-many-locals + @staticmethod + def _get_dragon_log_level() -> str: + smartsim_to_dragon = { + "developer": "INFO", + "debug": "NONE", + "info": "NONE", + "quiet": "NONE", + } + return smartsim_to_dragon.get(CONFIG.log_level, "NONE") + + def _connect_to_existing_server(self, path: Path) -> None: + dragon_config_log = path / 
CONFIG.dragon_log_filename + + if dragon_config_log.is_file(): + + dragon_confs = self._parse_launched_dragon_server_info_from_files( + [dragon_config_log] + ) + logger.debug(dragon_confs) + for dragon_conf in dragon_confs: + if not "address" in dragon_conf: + continue + logger.debug( + "Found dragon server config file. Checking if the server" + f" is still up at address {dragon_conf['address']}." + ) + try: + self._set_timeout(CONFIG.dragon_server_timeout) + self._get_new_authenticator() + self._handshake(dragon_conf["address"]) + except SmartSimError as e: + logger.error(e) + finally: + self._set_timeout(CONFIG.dragon_server_timeout) + if self.is_connected: + logger.debug("Connected to existing Dragon server") + return + + def _start_connector_socket(self, socket_addr: str) -> zmq.Socket[t.Any]: + connector_socket: t.Optional[zmq.Socket[t.Any]] = None + self._set_timeout(self._startup_timeout) + self._get_new_authenticator() + connector_socket = dragonSockets.get_secure_socket(self._context, zmq.REP, True) + logger.debug(f"Binding connector to {socket_addr}") + connector_socket.bind(socket_addr) + if connector_socket is None: + raise SmartSimError("Socket failed to initialize") + + return connector_socket + def connect_to_dragon(self) -> None: with DRG_LOCK: # TODO use manager instead @@ -172,64 +223,38 @@ def connect_to_dragon(self) -> None: raise SmartSimError("Path to Dragon server not set.") path = _resolve_dragon_path(self._dragon_server_path) - dragon_config_log = path / CONFIG.dragon_log_filename - - if dragon_config_log.is_file(): - dragon_confs = self._parse_launched_dragon_server_info_from_files( - [dragon_config_log] - ) - logger.debug(dragon_confs) - for dragon_conf in dragon_confs: - if not "address" in dragon_conf: - continue - logger.debug( - "Found dragon server config file. Checking if the server" - f" is still up at address {dragon_conf['address']}." 
- ) - try: - self._set_timeout(self._timeout) - self._get_new_authenticator() - self._handshake(dragon_conf["address"]) - except SmartSimError as e: - logger.debug(e) - finally: - self._set_timeout(self._timeout) - if self.is_connected: - logger.debug("Connected to existing Dragon server") - return + self._connect_to_existing_server(path) + if self.is_connected: + return path.mkdir(parents=True, exist_ok=True) + local_address = get_best_interface_and_address().address + if local_address is None: + # TODO parse output file + raise SmartSimError( + "Could not determine SmartSim's local address, " + "the Dragon server could not be started." + ) + # find first available port >= 5995 + port = find_free_port(start=5995) + socket_addr = f"tcp://{local_address}:{port}" + connector_socket = self._start_connector_socket(socket_addr) + cmd = [ "dragon", "-t", CONFIG.dragon_transport, "-l", - CONFIG.dragon_log_level, + DragonConnector._get_dragon_log_level(), sys.executable, "-m", "smartsim._core.entrypoints.dragon", + "+launching_address", + socket_addr, ] - address = get_best_interface_and_address().address - socket_addr = "" - connector_socket: t.Optional[zmq.Socket[t.Any]] = None - if address is not None: - self._set_timeout(self._startup_timeout) - self._get_new_authenticator() - connector_socket = dragonSockets.get_secure_socket( - self._context, zmq.REP, True - ) - - # find first available port >= 5995 - port = find_free_port(start=5995) - socket_addr = f"tcp://{address}:{port}" - logger.debug(f"Binding connector to {socket_addr}") - - connector_socket.bind(socket_addr) - cmd += ["+launching_address", socket_addr] - dragon_out_file = path / "dragon_head.out" dragon_err_file = path / "dragon_head.err" @@ -252,58 +277,44 @@ def connect_to_dragon(self) -> None: start_new_session=True, ) - if connector_socket is None: - raise SmartSimError("Socket failed to initialize") - - def log_dragon_outputs() -> None: - if self._dragon_head_process: - 
self._dragon_head_process.wait(1.0) - if self._dragon_head_process.stdout: - for line in iter( - self._dragon_head_process.stdout.readline, b"" - ): - logger.info(line.decode("utf-8").rstrip()) - if self._dragon_head_process.stderr: - for line in iter( - self._dragon_head_process.stderr.readline, b"" - ): - logger.warning(line.decode("utf-8").rstrip()) - logger.warning(self._dragon_head_process.returncode) - - if address is not None: - server = dragonSockets.as_server(connector_socket) - logger.debug(f"Listening to {socket_addr}") - request = _assert_schema_type(server.recv(), DragonBootstrapRequest) - - logger.debug(f"Connecting to {request.address}") - server.send( - DragonBootstrapResponse(dragon_pid=self._dragon_head_process.pid) + server = dragonSockets.as_server(connector_socket) + logger.debug(f"Listening to {socket_addr}") + request = _assert_schema_type(server.recv(), DragonBootstrapRequest) + server.send( + DragonBootstrapResponse(dragon_pid=self._dragon_head_process.pid) + ) + connector_socket.close() + logger.debug(f"Connecting to {request.address}") + self._set_timeout(CONFIG.dragon_server_timeout) + self._handshake(request.address) + + # Only the Connector which started the server is + # responsible of it, that's why we register the + # cleanup in this code branch. 
+ # The cleanup function should not have references + # to this object to avoid Garbage Collector lockup + server_socket = self._dragon_head_socket + server_process_pid = self._dragon_head_process.pid + + if server_socket is not None and self._dragon_head_process is not None: + atexit.register( + _dragon_cleanup, + server_socket=server_socket, + server_process_pid=server_process_pid, + server_authenticator=self._authenticator, + graceful=self._graceful_cleanup, ) - - connector_socket.close() - self._set_timeout(self._timeout) - self._handshake(request.address) - - # Only the Connector which started the server is - # responsible of it, that's why we register the - # cleanup in this code branch. - # The cleanup function should not have references - # to this object to avoid Garbage Collector lockup - server_socket = self._dragon_head_socket - server_process_pid = self._dragon_head_process.pid - - if server_socket is not None and self._dragon_head_process is not None: - atexit.register( - _dragon_cleanup, - server_socket=server_socket, - server_process_pid=server_process_pid, - server_authenticator=self._authenticator, - graceful=self._graceful_cleanup, - ) + elif self._dragon_head_process is not None: + self._dragon_head_process.wait(1.0) + if self._dragon_head_process.stdout: + for line in iter(self._dragon_head_process.stdout.readline, b""): + logger.info(line.decode("utf-8").rstrip()) + if self._dragon_head_process.stderr: + for line in iter(self._dragon_head_process.stderr.readline, b""): + logger.warning(line.decode("utf-8").rstrip()) + logger.warning(self._dragon_head_process.returncode) else: - # TODO parse output file - log_dragon_outputs() - raise SmartSimError("Could not receive address of Dragon head process") + logger.warning("Could not start Dragon server as subprocess") def cleanup(self, graceful: bool = True) -> None: if self._dragon_head_socket is not None and self._dragon_head_pid is not None: @@ -429,7 +440,7 @@ def _dragon_cleanup( print("Dragon 
server is not running.") finally: print( - f"Dragon server process shutdown is complete , return code {retcode}", + f"Dragon server process shutdown is complete, return code {retcode}", flush=True, ) diff --git a/smartsim/_core/launcher/dragon/dragonSockets.py b/smartsim/_core/launcher/dragon/dragonSockets.py index 188ea2a2f..9f6680e2c 100644 --- a/smartsim/_core/launcher/dragon/dragonSockets.py +++ b/smartsim/_core/launcher/dragon/dragonSockets.py @@ -123,7 +123,15 @@ def get_authenticator( if AUTHENTICATOR is not None: if AUTHENTICATOR.is_alive(): return AUTHENTICATOR - del AUTHENTICATOR + try: + logger.debug("Stopping authenticator") + AUTHENTICATOR.thread.authenticator.zap_socket.close() + AUTHENTICATOR.thread.join(0.1) + AUTHENTICATOR = None + except Exception as e: + logger.debug(e) + finally: + logger.debug("Stopped authenticator") config = get_config() @@ -137,8 +145,7 @@ def get_authenticator( logger.debug(f"Securing with client keys in {key_manager.client_keys_dir}") AUTHENTICATOR.configure_curve(domain="*", location=key_manager.client_keys_dir) - if not AUTHENTICATOR.is_alive(): - logger.debug("Starting authenticator") - AUTHENTICATOR.start() + logger.debug("Starting authenticator") + AUTHENTICATOR.start() return AUTHENTICATOR diff --git a/smartsim/_core/launcher/launcher.py b/smartsim/_core/launcher/launcher.py index 9f4e36150..db6edbe03 100644 --- a/smartsim/_core/launcher/launcher.py +++ b/smartsim/_core/launcher/launcher.py @@ -70,9 +70,16 @@ def run(self, step: Step) -> t.Optional[str]: def stop(self, step_name: str) -> StepInfo: raise NotImplementedError - @abc.abstractmethod def add_step_to_mapping_table(self, name: str, step_map: StepMap) -> None: - raise NotImplementedError + """Add a StepMap to the Launcher step mapping table + making it monitor the step. 
+ + :param name: name of step to be added + :type name: str + :param step_map: step map of added step + :type step_map: StepMap + """ + self.step_mapping[name] = step_map class WLMLauncher(Launcher): # cov-wlm @@ -91,9 +98,6 @@ def __init__(self) -> None: def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: raise NotImplementedError - def add_step_to_mapping_table(self, name: str, step_map: StepMap) -> None: - self.step_mapping[name] = step_map - # every launcher utilizing this interface must have a map # of supported RunSettings types (see slurmLauncher.py for ex) def create_step( diff --git a/smartsim/_core/launcher/local/local.py b/smartsim/_core/launcher/local/local.py index cad219897..96778ec0d 100644 --- a/smartsim/_core/launcher/local/local.py +++ b/smartsim/_core/launcher/local/local.py @@ -30,7 +30,7 @@ from ..launcher import Launcher from ..step import LocalStep, Step from ..stepInfo import StepInfo, UnmanagedStepInfo -from ..stepMapping import StepMap, StepMapping +from ..stepMapping import StepMapping from ..taskManager import TaskManager @@ -41,9 +41,6 @@ def __init__(self) -> None: self.task_manager = TaskManager() self.step_mapping = StepMapping() - def add_step_to_mapping_table(self, name: str, step_map: StepMap) -> None: - self.step_mapping[name] = step_map - def create_step(self, name: str, cwd: str, step_settings: SettingsBase) -> Step: """Create a job step to launch an entity locally diff --git a/smartsim/_core/launcher/slurm/slurmLauncher.py b/smartsim/_core/launcher/slurm/slurmLauncher.py index 0ae327030..a25e62806 100644 --- a/smartsim/_core/launcher/slurm/slurmLauncher.py +++ b/smartsim/_core/launcher/slurm/slurmLauncher.py @@ -138,7 +138,6 @@ def run(self, step: Step) -> t.Optional[str]: # Launch a batch step with Slurm if isinstance(step, SbatchStep): # wait for batch step to submit successfully - print(cmd_list, step.cwd) return_code, out, err = self.task_manager.start_and_wait(cmd_list, step.cwd) if return_code != 0: 
raise LauncherError(f"Sbatch submission failed\n {out}\n {err}") diff --git a/smartsim/_core/launcher/step/dragonStep.py b/smartsim/_core/launcher/step/dragonStep.py index e799bb299..1ec9989f5 100644 --- a/smartsim/_core/launcher/step/dragonStep.py +++ b/smartsim/_core/launcher/step/dragonStep.py @@ -159,7 +159,6 @@ def _dragon_entrypoint_cmd(request_file: str) -> str: "smartsim._core.entrypoints.dragon_client", "+submit", f"{request_file}", - "\n", ] return " ".join(cmd) @@ -197,7 +196,7 @@ def _write_request_file(self) -> str: return request_file def _write_sbatch_script(self) -> str: - """Write the batch script + """Write the PBS batch script :return: batch script path after writing :rtype: str @@ -223,11 +222,13 @@ def _write_sbatch_script(self) -> str: for cmd in self.batch_settings.preamble: script_file.write(f"{cmd}\n") - script_file.write(DragonBatchStep._dragon_entrypoint_cmd(request_file)) + script_file.write( + DragonBatchStep._dragon_entrypoint_cmd(request_file) + "\n" + ) return batch_script def _write_qsub_script(self) -> str: - """Write the batch script + """Write the Slurm batch script :return: batch script path after writing :rtype: str @@ -251,6 +252,8 @@ def _write_qsub_script(self) -> str: for cmd in self.batch_settings.preamble: script_file.write(f"{cmd}\n") - script_file.write(DragonBatchStep._dragon_entrypoint_cmd(request_file)) + script_file.write( + DragonBatchStep._dragon_entrypoint_cmd(request_file) + "\n" + ) return batch_script diff --git a/tests/test_dragon_launcher.py b/tests/test_dragon_launcher.py index 81a86da9e..aef43aa89 100644 --- a/tests/test_dragon_launcher.py +++ b/tests/test_dragon_launcher.py @@ -28,6 +28,7 @@ import multiprocessing as mp import os import sys +import time import typing as t import pytest @@ -62,6 +63,17 @@ def pid(self) -> int: def returncode(self) -> int: return 0 + @property + def stdout(self): + return None + + @property + def stderr(self): + return None + + def wait(self, timeout: float) -> None: + 
time.sleep(timeout) + class MockSocket: def __init__(self, *args: t.Any, **kwargs: t.Any) -> None: @@ -119,7 +131,6 @@ def mock_dragon_env(test_dir, *args, **kwargs): """Create a mock dragon environment that can talk to the launcher through ZMQ""" logger = logging.getLogger(__name__) logging.basicConfig(level=logging.DEBUG) - try: addr = "127.0.0.1" callback_port = kwargs["port"] @@ -286,7 +297,8 @@ def test_secure_socket_setup( def test_secure_socket(test_dir: str, monkeypatch: pytest.MonkeyPatch): """Ensure the authenticator created by the secure socket factory method is fully configured and started when returned to a client""" - + logger = logging.getLogger(__name__) + logging.basicConfig(level=logging.DEBUG) with monkeypatch.context() as ctx: # make sure we don't touch "real keys" during a test ctx.setenv("SMARTSIM_KEY_PATH", test_dir) @@ -308,7 +320,7 @@ def test_secure_socket(test_dir: str, monkeypatch: pytest.MonkeyPatch): received_msg = server.recv_string() assert received_msg == to_send - print("server receieved: ", received_msg) + logger.debug("server receieved: ", received_msg) finally: if authenticator: authenticator.stop() @@ -322,7 +334,6 @@ def test_secure_socket(test_dir: str, monkeypatch: pytest.MonkeyPatch): def test_dragon_launcher_handshake(monkeypatch: pytest.MonkeyPatch, test_dir: str): """Test that a real handshake between a launcher & dragon environment completes successfully using secure sockets""" - context = zmq.Context() addr = "127.0.0.1" bootstrap_port = find_free_port(start=5995) From 4884ad8f369d16aa687c0741a70b846bc540d99f Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 24 Apr 2024 02:33:53 -0500 Subject: [PATCH 034/101] Fix log --- tests/test_dragon_launcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_dragon_launcher.py b/tests/test_dragon_launcher.py index aef43aa89..96ae6768b 100644 --- a/tests/test_dragon_launcher.py +++ b/tests/test_dragon_launcher.py @@ -320,7 +320,7 @@ def 
test_secure_socket(test_dir: str, monkeypatch: pytest.MonkeyPatch): received_msg = server.recv_string() assert received_msg == to_send - logger.debug("server receieved: ", received_msg) + logger.debug(f"server received: {received_msg}") finally: if authenticator: authenticator.stop() From 01a5433d02f1143266b8a03cfe06cab410567aa7 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 24 Apr 2024 14:48:56 -0700 Subject: [PATCH 035/101] Fixes for perlmutter --- smartsim/_core/config/config.py | 2 +- smartsim/_core/entrypoints/dragon.py | 9 +-- .../_core/launcher/dragon/dragonConnector.py | 81 +++++++++++-------- .../_core/launcher/dragon/dragonSockets.py | 13 ++- 4 files changed, 65 insertions(+), 40 deletions(-) diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index c15b26d13..97a08eb1a 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -162,7 +162,7 @@ def dragon_server_path(self) -> t.Optional[str]: @property def dragon_server_timeout(self) -> int: - return int(os.getenv("SMARTSIM_DRAGON_TIMEOUT", "20000")) + return int(os.getenv("SMARTSIM_DRAGON_TIMEOUT", "30000")) @property def dragon_server_startup_timeout(self) -> int: diff --git a/smartsim/_core/entrypoints/dragon.py b/smartsim/_core/entrypoints/dragon.py index 58637d818..e89a2e34e 100644 --- a/smartsim/_core/entrypoints/dragon.py +++ b/smartsim/_core/entrypoints/dragon.py @@ -98,7 +98,6 @@ def run( dragon_pid: int, ) -> None: logger.debug(f"Opening socket {dragon_head_address}") - dragon_head_socket = dragonSockets.get_secure_socket(zmq_context, zmq.REP, True) dragon_head_socket.bind(dragon_head_address) dragon_backend = DragonBackend(pid=dragon_pid) @@ -158,8 +157,8 @@ def main(args: argparse.Namespace) -> int: if args.launching_address: zmq_context = zmq.Context() - zmq_context.setsockopt(zmq.SNDTIMEO, value=-1) - zmq_context.setsockopt(zmq.RCVTIMEO, value=-1) + zmq_context.setsockopt(zmq.SNDTIMEO, value=30000) + 
zmq_context.setsockopt(zmq.RCVTIMEO, value=30000) zmq_context.setsockopt(zmq.REQ_CORRELATE, 1) zmq_context.setsockopt(zmq.REQ_RELAXED, 1) @@ -169,9 +168,7 @@ def main(args: argparse.Namespace) -> int: else: dragon_head_address += ":5555" - zmq_authenticator = dragonSockets.get_authenticator(zmq_context) - zmq_authenticator.thread.authenticator.zap_socket.setsockopt(zmq.SNDTIMEO, -1) - zmq_authenticator.thread.authenticator.zap_socket.setsockopt(zmq.RCVTIMEO, -1) + zmq_authenticator = dragonSockets.get_authenticator(zmq_context, timeout=-1) logger.debug("Getting launcher socket") launcher_socket = dragonSockets.get_secure_socket(zmq_context, zmq.REQ, False) diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 554afbf87..33b2cbab1 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -43,11 +43,10 @@ import zmq import zmq.auth.thread -from smartsim._core.launcher.dragon import dragonSockets -from smartsim.error.errors import SmartSimError - +from ...._core.launcher.dragon import dragonSockets +from ....error.errors import SmartSimError from ....log import get_logger -from ...config import CONFIG +from ...config import get_config from ...schemas import ( DragonBootstrapRequest, DragonBootstrapResponse, @@ -76,14 +75,14 @@ def __init__(self, graceful_cleanup: bool = True) -> None: self._context.setsockopt(zmq.REQ_CORRELATE, 1) self._context.setsockopt(zmq.REQ_RELAXED, 1) self._authenticator: t.Optional[zmq.auth.thread.ThreadAuthenticator] = None - self._startup_timeout = CONFIG.dragon_server_startup_timeout - self._set_timeout(CONFIG.dragon_server_timeout) + config = get_config() + self._reset_timeout(config.dragon_server_timeout) self._dragon_head_socket: t.Optional[zmq.Socket[t.Any]] = None self._dragon_head_process: t.Optional[subprocess.Popen[bytes]] = None # Returned by dragon head, useful if shutdown is to be requested # but 
process was started by another connector self._dragon_head_pid: t.Optional[int] = None - self._dragon_server_path = CONFIG.dragon_server_path + self._dragon_server_path = config.dragon_server_path self._graceful_cleanup = graceful_cleanup logger.debug(f"Dragon Server path was set to {self._dragon_server_path}") if self._dragon_server_path is None: @@ -125,7 +124,7 @@ def _handshake(self, address: str) -> None: f"Unsuccessful handshake with Dragon server at address {address}" ) from e - def _set_timeout(self, timeout: int) -> None: + def _reset_timeout(self, timeout: int = get_config().dragon_server_timeout) -> None: self._context.setsockopt(zmq.SNDTIMEO, value=timeout) self._context.setsockopt(zmq.RCVTIMEO, value=timeout) if self._authenticator is not None and self._authenticator.thread is not None: @@ -145,7 +144,9 @@ def ensure_connected(self) -> None: if not self.is_connected: raise SmartSimError("Could not connect to Dragon server") - def _get_new_authenticator(self) -> None: + def _get_new_authenticator( + self, timeout: int = get_config().dragon_server_timeout + ) -> None: if self._authenticator is not None: if self._authenticator.thread is not None: try: @@ -158,7 +159,9 @@ def _get_new_authenticator(self) -> None: except zmq.Again: logger.debug("Could not stop authenticator") try: - self._authenticator = dragonSockets.get_authenticator(self._context) + self._authenticator = dragonSockets.get_authenticator( + self._context, timeout + ) return except RuntimeError as e: logger.error("Could not get authenticator") @@ -172,10 +175,11 @@ def _get_dragon_log_level() -> str: "info": "NONE", "quiet": "NONE", } - return smartsim_to_dragon.get(CONFIG.log_level, "NONE") + return smartsim_to_dragon.get(get_config().log_level, "NONE") def _connect_to_existing_server(self, path: Path) -> None: - dragon_config_log = path / CONFIG.dragon_log_filename + config = get_config() + dragon_config_log = path / config.dragon_log_filename if dragon_config_log.is_file(): @@ -191,21 
+195,22 @@ def _connect_to_existing_server(self, path: Path) -> None: f" is still up at address {dragon_conf['address']}." ) try: - self._set_timeout(CONFIG.dragon_server_timeout) + self._reset_timeout() self._get_new_authenticator() self._handshake(dragon_conf["address"]) except SmartSimError as e: logger.error(e) finally: - self._set_timeout(CONFIG.dragon_server_timeout) + self._reset_timeout(config.dragon_server_timeout) if self.is_connected: logger.debug("Connected to existing Dragon server") return def _start_connector_socket(self, socket_addr: str) -> zmq.Socket[t.Any]: + config = get_config() connector_socket: t.Optional[zmq.Socket[t.Any]] = None - self._set_timeout(self._startup_timeout) - self._get_new_authenticator() + self._reset_timeout(config.dragon_server_startup_timeout) + self._get_new_authenticator(config.dragon_server_startup_timeout) connector_socket = dragonSockets.get_secure_socket(self._context, zmq.REP, True) logger.debug(f"Binding connector to {socket_addr}") connector_socket.bind(socket_addr) @@ -215,6 +220,7 @@ def _start_connector_socket(self, socket_addr: str) -> zmq.Socket[t.Any]: return connector_socket def connect_to_dragon(self) -> None: + config = get_config() with DRG_LOCK: # TODO use manager instead if self.is_connected: @@ -245,7 +251,7 @@ def connect_to_dragon(self) -> None: cmd = [ "dragon", "-t", - CONFIG.dragon_transport, + config.dragon_transport, "-l", DragonConnector._get_dragon_log_level(), sys.executable, @@ -285,7 +291,7 @@ def connect_to_dragon(self) -> None: ) connector_socket.close() logger.debug(f"Connecting to {request.address}") - self._set_timeout(CONFIG.dragon_server_timeout) + self._reset_timeout(config.dragon_server_timeout) self._handshake(request.address) # Only the Connector which started the server is @@ -424,29 +430,40 @@ def _dragon_cleanup( finally: print("Authenticator shutdown is complete") - if server_process_pid and psutil.pid_exists(server_process_pid): - retcode = None - print("Terminating Dragon 
server") + if server_process_pid: try: - if graceful: - os.kill(server_process_pid, signal.SIGINT) - time.sleep(2) - os.kill(server_process_pid, signal.SIGINT) - time.sleep(10) - os.kill(server_process_pid, signal.SIGTERM) _, retcode = os.waitpid(server_process_pid, 0) - except ProcessLookupError: - # Can't use the logger as I/O file may be closed - print("Dragon server is not running.") - finally: print( f"Dragon server process shutdown is complete, return code {retcode}", flush=True, ) + except Exception as e: + logger.debug(e) + + # TODO remove this code once we are sure that it is not needed anymore + # if server_process_pid and psutil.pid_exists(server_process_pid): + # retcode = None + # print("Terminating Dragon server") + # try: + # if graceful: + # os.kill(server_process_pid, signal.SIGINT) + # time.sleep(2) + # os.kill(server_process_pid, signal.SIGINT) + # time.sleep(20) + # os.kill(server_process_pid, signal.SIGTERM) + # _, retcode = os.waitpid(server_process_pid, 0) + # except ProcessLookupError: + # # Can't use the logger as I/O file may be closed + # print("Dragon server is not running.") + # finally: + # print( + # f"Dragon server process shutdown is complete, return code {retcode}", + # flush=True, + # ) def _resolve_dragon_path(fallback: t.Union[str, "os.PathLike[str]"]) -> Path: - dragon_server_path = CONFIG.dragon_server_path or os.path.join( + dragon_server_path = get_config().dragon_server_path or os.path.join( fallback, ".smartsim", "dragon" ) dragon_server_paths = dragon_server_path.split(":") diff --git a/smartsim/_core/launcher/dragon/dragonSockets.py b/smartsim/_core/launcher/dragon/dragonSockets.py index 9f6680e2c..ab4ecd00c 100644 --- a/smartsim/_core/launcher/dragon/dragonSockets.py +++ b/smartsim/_core/launcher/dragon/dragonSockets.py @@ -109,7 +109,7 @@ def get_secure_socket( def get_authenticator( - context: "zmq.Context[t.Any]", + context: "zmq.Context[t.Any]", timeout: int = get_config().dragon_server_timeout ) -> 
"zmq.auth.thread.ThreadAuthenticator": """Create an authenticator to handle encryption of ZMQ communications @@ -141,6 +141,14 @@ def get_authenticator( AUTHENTICATOR = zmq.auth.thread.ThreadAuthenticator(context) + ctx_sndtimeo = context.getsockopt(zmq.SNDTIMEO) + ctx_rcvtimeo = context.getsockopt(zmq.RCVTIMEO) + + AUTHENTICATOR.context.setsockopt(zmq.SNDTIMEO, timeout) + AUTHENTICATOR.context.setsockopt(zmq.RCVTIMEO, timeout) + AUTHENTICATOR.context.setsockopt(zmq.REQ_CORRELATE, 1) + AUTHENTICATOR.context.setsockopt(zmq.REQ_RELAXED, 1) + # allow all keys in the client key directory to connect logger.debug(f"Securing with client keys in {key_manager.client_keys_dir}") AUTHENTICATOR.configure_curve(domain="*", location=key_manager.client_keys_dir) @@ -148,4 +156,7 @@ def get_authenticator( logger.debug("Starting authenticator") AUTHENTICATOR.start() + context.setsockopt(zmq.SNDTIMEO, ctx_sndtimeo) + context.setsockopt(zmq.RCVTIMEO, ctx_rcvtimeo) + return AUTHENTICATOR From 612b73426b2a0e99a9083db3186b8410e1c178b9 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 25 Apr 2024 11:22:31 -0700 Subject: [PATCH 036/101] Address more review comments --- smartsim/_core/entrypoints/dragon_client.py | 2 +- .../_core/launcher/dragon/dragonBackend.py | 39 +++++----- .../_core/launcher/dragon/dragonConnector.py | 32 ++------- .../_core/launcher/dragon/dragonLauncher.py | 71 +++++++++---------- tests/on_wlm/test_dragon.py | 31 +++++--- tests/on_wlm/test_simple_entity_launch.py | 16 ++--- tests/test_controller_errors.py | 4 +- 7 files changed, 90 insertions(+), 105 deletions(-) diff --git a/smartsim/_core/entrypoints/dragon_client.py b/smartsim/_core/entrypoints/dragon_client.py index c026e7e4f..4043d5308 100644 --- a/smartsim/_core/entrypoints/dragon_client.py +++ b/smartsim/_core/entrypoints/dragon_client.py @@ -74,7 +74,7 @@ def main(args: argparse.Namespace) -> int: requests.append(DragonShutdownRequest(immediate=False, frontend_shutdown=True)) - connector = 
DragonConnector(graceful_cleanup=False) + connector = DragonConnector() for request in requests: response = connector.send_request(request) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index d728e574f..76c35c2a9 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -111,7 +111,6 @@ class DragonBackend: def __init__(self, pid: int) -> None: self._pid = pid self._group_infos: t.Dict[str, ProcessGroupInfo] = {} - self._step_id_lock = RLock() self._queue_lock = RLock() self._step_id = 0 # hosts available for execution @@ -131,23 +130,27 @@ def __init__(self, pid: int) -> None: host_string = str(num_hosts) + (" hosts" if num_hosts > 1 else " host") self._shutdown_requested = False self._can_shutdown = False - self._frontend_shutdown: t.Optional[bool] = None + self._frontend_shutdown: bool = False logger.debug(f"{host_string} available for execution: {self._hosts}") - def print_status(self) -> None: - logger.debug(f"System hosts: {self._hosts}") - logger.debug(f"Free hosts: {list(self._free_hosts)}") - logger.debug(f"Allocated hosts: {self._allocated_hosts}") - logger.debug(f"Running steps: {self._running_steps}") - logger.debug(f"Group infos: {self._group_infos}") - logger.debug(f"There are {len(self._queued_steps)} queued steps") + def __str__(self) -> str: + return self.get_status_message() + + def get_status_message(self) -> str: + msg = [f"System hosts: {self._hosts}"] + msg.append(f"Free hosts: {list(self._free_hosts)}") + msg.append(f"Allocated hosts: {self._allocated_hosts}") + msg.append(f"Running steps: {self._running_steps}") + msg.append(f"Group infos: {self._group_infos}") + msg.append(f"There are {len(self._queued_steps)} queued steps") + return "\n".join(msg) def _heartbeat(self) -> None: self._last_beat = self.current_time @property def frontend_shutdown(self) -> bool: - return bool(self._frontend_shutdown) + return 
self._frontend_shutdown @property def last_heartbeat(self) -> float: @@ -198,10 +201,9 @@ def _allocate_step( return to_allocate def _get_new_id(self) -> str: - with self._step_id_lock: - step_id = create_short_id_str() + "-" + str(self._step_id) - self._step_id += 1 - return step_id + step_id = create_short_id_str() + "-" + str(self._step_id) + self._step_id += 1 + return step_id @functools.singledispatchmethod # Deliberately suppressing errors so that overloads have the same signature @@ -449,7 +451,7 @@ def _update_shutdown_status(self) -> None: for grp_info in self._group_infos.values() ) - def _should_update(self) -> bool: + def _should_print_status(self) -> bool: if self._last_beat - self._last_update_time > 10: self._last_update_time = self._last_beat return True @@ -464,13 +466,12 @@ def update(self) -> None: self._start_steps() self._refresh_statuses() self._update_shutdown_status() - time.sleep(0.1) except Exception as e: logger.error(e) - if self._should_update(): + if self._should_print_status(): try: - self.print_status() - except Exception as e: + logger.debug(str(self)) + except ValueError as e: logger.error(e) logger.debug("Dragon Backend update thread stopping") diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 33b2cbab1..2a201c76e 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -34,7 +34,6 @@ import signal import subprocess import sys -import time import typing as t from pathlib import Path from threading import RLock @@ -70,7 +69,7 @@ class DragonConnector: to start a Dragon server and communicate with it. 
""" - def __init__(self, graceful_cleanup: bool = True) -> None: + def __init__(self) -> None: self._context: zmq.Context[t.Any] = zmq.Context.instance() self._context.setsockopt(zmq.REQ_CORRELATE, 1) self._context.setsockopt(zmq.REQ_RELAXED, 1) @@ -83,7 +82,6 @@ def __init__(self, graceful_cleanup: bool = True) -> None: # but process was started by another connector self._dragon_head_pid: t.Optional[int] = None self._dragon_server_path = config.dragon_server_path - self._graceful_cleanup = graceful_cleanup logger.debug(f"Dragon Server path was set to {self._dragon_server_path}") if self._dragon_server_path is None: raise SmartSimError( @@ -308,7 +306,6 @@ def connect_to_dragon(self) -> None: server_socket=server_socket, server_process_pid=server_process_pid, server_authenticator=self._authenticator, - graceful=self._graceful_cleanup, ) elif self._dragon_head_process is not None: self._dragon_head_process.wait(1.0) @@ -322,13 +319,12 @@ def connect_to_dragon(self) -> None: else: logger.warning("Could not start Dragon server as subprocess") - def cleanup(self, graceful: bool = True) -> None: + def cleanup(self) -> None: if self._dragon_head_socket is not None and self._dragon_head_pid is not None: _dragon_cleanup( server_socket=self._dragon_head_socket, server_process_pid=self._dragon_head_pid, server_authenticator=self._authenticator, - graceful=graceful, ) self._dragon_head_socket = None self._dragon_head_pid = 0 @@ -397,7 +393,6 @@ def _dragon_cleanup( server_socket: t.Optional[zmq.Socket[t.Any]] = None, server_process_pid: t.Optional[int] = 0, server_authenticator: t.Optional[zmq.auth.thread.ThreadAuthenticator] = None, - graceful: bool = True, ) -> None: """Clean up resources used by the launcher. 
:param server_socket: (optional) Socket used to connect to dragon environment @@ -418,6 +413,8 @@ def _dragon_cleanup( # Can't use the logger as I/O file may be closed print("Could not send shutdown request to dragon server") print(f"ZMQ error: {e}", flush=True) + if server_process_pid and psutil.pid_exists(server_process_pid): + os.kill(server_process_pid, signal.SIGKILL) finally: print("Sending shutdown request is complete") @@ -440,27 +437,6 @@ def _dragon_cleanup( except Exception as e: logger.debug(e) - # TODO remove this code once we are sure that it is not needed anymore - # if server_process_pid and psutil.pid_exists(server_process_pid): - # retcode = None - # print("Terminating Dragon server") - # try: - # if graceful: - # os.kill(server_process_pid, signal.SIGINT) - # time.sleep(2) - # os.kill(server_process_pid, signal.SIGINT) - # time.sleep(20) - # os.kill(server_process_pid, signal.SIGTERM) - # _, retcode = os.waitpid(server_process_pid, 0) - # except ProcessLookupError: - # # Can't use the logger as I/O file may be closed - # print("Dragon server is not running.") - # finally: - # print( - # f"Dragon server process shutdown is complete, return code {retcode}", - # flush=True, - # ) - def _resolve_dragon_path(fallback: t.Union[str, "os.PathLike[str]"]) -> Path: dragon_server_path = get_config().dragon_server_path or os.path.join( diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py index 30b6fa428..46405348c 100644 --- a/smartsim/_core/launcher/dragon/dragonLauncher.py +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -30,7 +30,7 @@ import typing as t from ...._core.launcher.stepMapping import StepMap -from ....error import LauncherError +from ....error import LauncherError, SmartSimError from ....log import get_logger from ....settings import ( DragonRunSettings, @@ -98,20 +98,20 @@ def add_step_to_mapping_table(self, name: str, step_map: StepMap) -> None: if step_map.step_id is None: 
return + sublauncher: t.Optional[t.Union[SlurmLauncher, PBSLauncher]] = None if step_map.step_id.startswith("SLURM-"): - slurm_step_map = StepMap( - step_id=DragonLauncher._unprefix_step_id(step_map.step_id), - task_id=step_map.task_id, - managed=step_map.managed, - ) - self._slurm_launcher.add_step_to_mapping_table(name, slurm_step_map) + sublauncher = self._slurm_launcher elif step_map.step_id.startswith("PBS-"): - pbs_step_map = StepMap( - step_id=DragonLauncher._unprefix_step_id(step_map.step_id), - task_id=step_map.task_id, - managed=step_map.managed, - ) - self._pbs_launcher.add_step_to_mapping_table(name, pbs_step_map) + sublauncher = self._pbs_launcher + else: + raise ValueError(f"Step id {step_map.step_id} is not valid.") + + sublauncher_step_map = StepMap( + step_id=DragonLauncher._unprefix_step_id(step_map.step_id), + task_id=step_map.task_id, + managed=step_map.managed, + ) + sublauncher.add_step_to_mapping_table(name, sublauncher_step_map) def run(self, step: Step) -> t.Optional[str]: """Run a job step through Slurm @@ -133,31 +133,30 @@ def run(self, step: Step) -> t.Optional[str]: out, err = step.get_output_files() if isinstance(step, DragonBatchStep): + # wait for batch step to submit successfully + sublauncher_step_id: t.Optional[str] = None + return_code, out, err = self.task_manager.start_and_wait(cmd, step.cwd) + if return_code != 0: + raise LauncherError(f"Sbatch submission failed\n {out}\n {err}") + if out: + sublauncher_step_id = out.strip() + logger.debug( + f"Gleaned batch job id: {sublauncher_step_id} for {step.name}" + ) + + if sublauncher_step_id is None: + raise SmartSimError("Could not get step id for batch step") + if isinstance(step.batch_settings, SbatchSettings): - # wait for batch step to submit successfully - return_code, out, err = self.task_manager.start_and_wait(cmd, step.cwd) - if return_code != 0: - raise LauncherError(f"Sbatch submission failed\n {out}\n {err}") - if out: - slurm_step_id = out.strip() - 
logger.debug(f"Gleaned batch job id: {step_id} for {step.name}") - - self._slurm_launcher.step_mapping.add( - step.name, slurm_step_id, task_id, step.managed - ) - step_id = "SLURM-" + slurm_step_id + self._slurm_launcher.step_mapping.add( + step.name, sublauncher_step_id, task_id, step.managed + ) + step_id = "SLURM-" + sublauncher_step_id elif isinstance(step.batch_settings, QsubBatchSettings): - # wait for batch step to submit successfully - return_code, out, err = self.task_manager.start_and_wait(cmd, step.cwd) - if return_code != 0: - raise LauncherError(f"Qsub batch submission failed\n {out}\n {err}") - if out: - pbs_step_id = out.strip() - logger.debug(f"Gleaned batch job id: {step_id} for {step.name}") - self._pbs_launcher.step_mapping.add( - step.name, pbs_step_id, task_id, step.managed - ) - step_id = "PBS-" + pbs_step_id + self._pbs_launcher.step_mapping.add( + step.name, sublauncher_step_id, task_id, step.managed + ) + step_id = "PBS-" + sublauncher_step_id elif isinstance(step, DragonStep): run_args = step.run_settings.run_args env = step.run_settings.env_vars diff --git a/tests/on_wlm/test_dragon.py b/tests/on_wlm/test_dragon.py index 7809d6647..69d2203d2 100644 --- a/tests/on_wlm/test_dragon.py +++ b/tests/on_wlm/test_dragon.py @@ -47,9 +47,13 @@ def test_dragon_global_path(global_dragon_teardown, wlmutils, test_dir, monkeypa exp.generate(model) exp.start(model, block=True) - assert exp.get_status(model)[0] == SmartSimStatus.STATUS_COMPLETED - launcher: DragonLauncher = exp._control._launcher - launcher.cleanup() + try: + assert exp.get_status(model)[0] == SmartSimStatus.STATUS_COMPLETED + except Exception as e: + raise e from None + finally: + launcher: DragonLauncher = exp._control._launcher + launcher.cleanup() def test_dragon_exp_path(global_dragon_teardown, wlmutils, test_dir, monkeypatch): @@ -65,10 +69,13 @@ def test_dragon_exp_path(global_dragon_teardown, wlmutils, test_dir, monkeypatch exp.generate(model) exp.start(model, block=True) - 
assert exp.get_status(model)[0] == SmartSimStatus.STATUS_COMPLETED - - launcher: DragonLauncher = exp._control._launcher - launcher.cleanup() + try: + assert exp.get_status(model)[0] == SmartSimStatus.STATUS_COMPLETED + except Exception as e: + raise e from None + finally: + launcher: DragonLauncher = exp._control._launcher + launcher.cleanup() def test_dragon_cannot_honor(wlmutils, test_dir): @@ -84,6 +91,10 @@ def test_dragon_cannot_honor(wlmutils, test_dir): exp.generate(model) exp.start(model, block=True) - assert exp.get_status(model)[0] == SmartSimStatus.STATUS_FAILED - launcher: DragonLauncher = exp._control._launcher - launcher.cleanup() + try: + assert exp.get_status(model)[0] == SmartSimStatus.STATUS_FAILED + except Exception as e: + raise e from None + finally: + launcher: DragonLauncher = exp._control._launcher + launcher.cleanup() diff --git a/tests/on_wlm/test_simple_entity_launch.py b/tests/on_wlm/test_simple_entity_launch.py index 09fca3dd9..a01b6259e 100644 --- a/tests/on_wlm/test_simple_entity_launch.py +++ b/tests/on_wlm/test_simple_entity_launch.py @@ -118,21 +118,21 @@ def test_summary(fileutils, test_dir, wlmutils): exp_name = "test-launch-summary" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) - sleep = fileutils.get_test_conf_path("sleep.py") + sleep_exp = fileutils.get_test_conf_path("sleep.py") bad = fileutils.get_test_conf_path("bad.py") - sleep_settings = exp.create_run_settings("python", f"{sleep} --time=3") + sleep_settings = exp.create_run_settings("python", f"{sleep_exp} --time=3") sleep_settings.set_tasks(1) bad_settings = exp.create_run_settings("python", f"{bad} --time=6") bad_settings.set_tasks(1) - sleep = exp.create_model("sleep", path=test_dir, run_settings=sleep_settings) + sleep_exp = exp.create_model("sleep", path=test_dir, run_settings=sleep_settings) bad = exp.create_model("bad", path=test_dir, run_settings=bad_settings) # start and poll - exp.start(sleep, bad) + exp.start(sleep_exp, 
bad) assert exp.get_status(bad)[0] == SmartSimStatus.STATUS_FAILED - assert exp.get_status(sleep)[0] == SmartSimStatus.STATUS_COMPLETED + assert exp.get_status(sleep_exp)[0] == SmartSimStatus.STATUS_COMPLETED summary_str = exp.summary(style="plain") print(summary_str) @@ -144,11 +144,11 @@ def test_summary(fileutils, test_dir, wlmutils): # the rows will be sleep, bad row = dict(zip(headers, rows[0])) row_1 = dict(zip(headers, rows[1])) - if row["Name"] != sleep.name: + if row["Name"] != sleep_exp.name: row_1, row = row, row_1 - assert sleep.name == row["Name"] - assert sleep.type == row["Entity-Type"] + assert sleep_exp.name == row["Name"] + assert sleep_exp.type == row["Entity-Type"] assert 0 == int(row["RunID"]) assert 0 == int(row["Returncode"]) diff --git a/tests/test_controller_errors.py b/tests/test_controller_errors.py index 8ddf02db5..2d623cdd1 100644 --- a/tests/test_controller_errors.py +++ b/tests/test_controller_errors.py @@ -196,11 +196,9 @@ def test_starting_entity(test_dir, wlmutils, entity, entity_2): step_settings = RunSettings("echo") step = MockStep("mock-step", test_dir, step_settings) test_launcher = wlmutils.get_test_launcher() - # if test_launcher == "dragon": - # step = DragonStep("mock-step", test_dir, step_settings) controller = Controller(test_launcher) controller._jobs.add_job(entity.name, job_id="1234", entity=entity) controller._jobs.move_to_completed(controller._jobs.jobs.get(entity.name)) with pytest.raises(SSUnsupportedError) as ex: controller._launch_step(step, entity=entity_2) - assert ex.value.args[0] == "SmartSim entities cannot have duplicate names." + assert ex.value.args[0] == "SmartSim entities cannot have duplicate names." 
From 4f667af7c0ea59085f4de3b15e756103e762adf3 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 25 Apr 2024 16:01:52 -0700 Subject: [PATCH 037/101] Add cooldown --- smartsim/_core/entrypoints/dragon.py | 16 +- .../_core/launcher/dragon/dragonBackend.py | 221 ++++++++++++------ .../_core/launcher/dragon/dragonConnector.py | 20 +- 3 files changed, 169 insertions(+), 88 deletions(-) diff --git a/smartsim/_core/entrypoints/dragon.py b/smartsim/_core/entrypoints/dragon.py index e89a2e34e..0e26a07c3 100644 --- a/smartsim/_core/entrypoints/dragon.py +++ b/smartsim/_core/entrypoints/dragon.py @@ -38,6 +38,7 @@ import zmq import zmq.auth.thread +from smartsim._core.config import get_config from smartsim._core.launcher.dragon import dragonSockets from smartsim._core.launcher.dragon.dragonBackend import DragonBackend from smartsim._core.schemas import DragonBootstrapRequest, DragonBootstrapResponse @@ -84,8 +85,12 @@ def print_summary(network_interface: str, ip_address: str) -> None: def start_updater( backend: DragonBackend, updater: t.Optional[ContextThread] ) -> ContextThread: + # If the updater was started, check if it completed or died if updater is not None: updater.join(0.1) + # If it's alive, there is nothing to do + if updater.is_alive(): + return updater updater = ContextThread(name="DragonBackend", daemon=True, target=backend.update) updater.start() return updater @@ -127,7 +132,7 @@ def run( heartbeat_delay = ( dragon_backend.current_time - dragon_backend.last_heartbeat ) - if heartbeat_delay > 10.0: + if heartbeat_delay > 30.0: logger.debug( f"Restarting updater after {heartbeat_delay:.2f} " "seconds of inactivity." 
@@ -155,10 +160,15 @@ def main(args: argparse.Namespace) -> int: raise ValueError("Net interface could not be determined") dragon_head_address = f"tcp://{address}" + smartsim_config = get_config() if args.launching_address: zmq_context = zmq.Context() - zmq_context.setsockopt(zmq.SNDTIMEO, value=30000) - zmq_context.setsockopt(zmq.RCVTIMEO, value=30000) + zmq_context.setsockopt( + zmq.SNDTIMEO, value=smartsim_config.dragon_server_timeout + ) + zmq_context.setsockopt( + zmq.RCVTIMEO, value=smartsim_config.dragon_server_timeout + ) zmq_context.setsockopt(zmq.REQ_CORRELATE, 1) zmq_context.setsockopt(zmq.REQ_RELAXED, 1) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 76c35c2a9..97b20b377 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -43,6 +43,7 @@ # pylint: enable=import-error # isort: on +from ...._core.config import get_config from ...._core.schemas import ( DragonHandshakeRequest, DragonHandshakeResponse, @@ -70,14 +71,21 @@ @dataclass class ProcessGroupInfo: status: SmartSimStatus + """Status of step""" process_group: t.Optional[ProcessGroup] = None + """Internal Process Group object, None for finished or not started steps""" puids: t.Optional[t.List[t.Optional[int]]] = None # puids can be None + """List of Process UIDS belonging to the ProcessGroup""" return_codes: t.Optional[t.List[int]] = None + """List of return codes of completed processes""" hosts: t.List[str] = field(default_factory=list) + """List of hosts on which the Process Group """ redir_workers: t.Optional[ProcessGroup] = None + """Workers used to redirect stdout and stderr to file""" @property def smartsim_info(self) -> t.Tuple[SmartSimStatus, t.Optional[t.List[int]]]: + """Information needed by SmartSim Launcher and Job Manager""" return (self.status, self.return_codes) @@ -110,34 +118,66 @@ class DragonBackend: def __init__(self, pid: int) -> None: self._pid = 
pid + """PID of dragon executable which launched this server""" self._group_infos: t.Dict[str, ProcessGroupInfo] = {} + """ProcessGroup execution state information""" self._queue_lock = RLock() - self._step_id = 0 - # hosts available for execution - # dictionary maps hostname to step_id of - # step being executed on it + """Lock that needs to be acquired to access internal queues""" + self._group_info_lock = RLock() + """Lock that needs to be acquired to access _group_infos""" + self._step_id: int = 0 + """Incremental ID to assign to new steps prior to execution""" + self._initialize_hosts() self._queued_steps: "collections.OrderedDict[str, DragonRunRequest]" = ( collections.OrderedDict() ) + """Steps waiting for execution""" self._stop_requests: t.Deque[DragonStopRequest] = collections.deque() + """Stop requests which have not been processed yet""" self._running_steps: t.List[str] = [] + """List of currently running steps""" self._completed_steps: t.List[str] = [] + """List of completed steps""" self._last_beat: float = 0.0 + """Time at which the last heartbeat was set""" self._heartbeat() self._last_update_time = self._last_beat + """Time at which the status update was printed the last time""" num_hosts = len(self._hosts) host_string = str(num_hosts) + (" hosts" if num_hosts > 1 else " host") self._shutdown_requested = False + """Whether the shutdown was requested to this server""" self._can_shutdown = False + """Whether the server can shut down""" self._frontend_shutdown: bool = False + """Whether the server frontend should shut down when the backend does""" + self._shutdown_initiation_time: t.Optional[float] = None + """The time at which the server initiated shutdown""" + smartsim_config = get_config() + self._cooldown_period = ( + smartsim_config.jm_interval * 2 if smartsim_config.telemetry_enabled else 5 + ) + """Time in seconds needed to server to complete shutdown""" logger.debug(f"{host_string} available for execution: {self._hosts}") + def 
_initialize_hosts(self) -> None: + with self._queue_lock: + self._hosts: t.List[str] = sorted( + Node(node).hostname for node in System().nodes + ) + """List of hosts available in allocation""" + self._free_hosts: t.Deque[str] = collections.deque(self._hosts) + """List of hosts on which steps can be launched""" + self._allocated_hosts: t.Dict[str, str] = {} + """List of hosts on which a step is already running""" + def __str__(self) -> str: return self.get_status_message() def get_status_message(self) -> str: - msg = [f"System hosts: {self._hosts}"] + msg = ["Dragon server backend update"] + msg.append(f"System hosts: {self._hosts}") msg.append(f"Free hosts: {list(self._free_hosts)}") msg.append(f"Allocated hosts: {self._allocated_hosts}") msg.append(f"Running steps: {self._running_steps}") @@ -148,33 +188,48 @@ def get_status_message(self) -> str: def _heartbeat(self) -> None: self._last_beat = self.current_time + @property + def _has_cooled_down(self) -> bool: + if self._shutdown_initiation_time is None: + self._shutdown_initiation_time = self.current_time + return ( + self.current_time - self._shutdown_initiation_time > self._cooldown_period + ) + @property def frontend_shutdown(self) -> bool: + """Whether the frontend will have to shutdown once the backend does + + If False, the frontend will wait for an external signal to stop. + """ return self._frontend_shutdown @property def last_heartbeat(self) -> float: + """Time (in seconds) at which the last heartbeat was set""" return self._last_beat @property def should_shutdown(self) -> bool: - return self._shutdown_requested and self._can_shutdown + """ "Whether the server should shut down + + A server should shut down if a DragonShutdownRequest was received + and it requested immediate shutdown, or if it did not request immediate + shutdown, but all jobs have been executed. + In both cases, a cooldown period may need to be waited before shutdown. 
+ """ + if self._shutdown_requested and self._can_shutdown: + return self._has_cooled_down + return False @property def current_time(self) -> float: + """Current time for DragonBackend object, in seconds since the Epoch""" return time.time_ns() / 1e9 - def _initialize_hosts(self) -> None: - with self._queue_lock: - self._hosts: t.List[str] = sorted( - Node(node).hostname for node in System().nodes - ) - self._free_hosts: t.Deque[str] = collections.deque(self._hosts) - self._allocated_hosts: t.Dict[str, str] = {} - def _can_honor(self, request: DragonRunRequest) -> t.Tuple[bool, t.Optional[str]]: - """Check if request can be honored with resources - available in the allocation. + """Check if request can be honored with resources available in the allocation. + Currently only checks for total number of nodes, in the future it will also look at other constraints such as memory, accelerators, and so on. @@ -205,30 +260,6 @@ def _get_new_id(self) -> str: self._step_id += 1 return step_id - @functools.singledispatchmethod - # Deliberately suppressing errors so that overloads have the same signature - # pylint: disable-next=no-self-use - def process_request(self, request: DragonRequest) -> DragonResponse: - raise TypeError(f"Unsure how to process a `{type(request)}` request") - - @process_request.register - def _(self, request: DragonRunRequest) -> DragonRunResponse: - - step_id = self._get_new_id() - honorable, err = self._can_honor(request) - if not honorable: - self._group_infos[step_id] = ProcessGroupInfo( - status=SmartSimStatus.STATUS_FAILED, return_codes=[-1] - ) - return DragonRunResponse(step_id=step_id, error_message=err) - - with self._queue_lock: - self._queued_steps[step_id] = request - self._group_infos[step_id] = ProcessGroupInfo( - status=SmartSimStatus.STATUS_NEVER_STARTED - ) - return DragonRunResponse(step_id=step_id) - @staticmethod def _start_redirect_workers( global_policy: Policy, @@ -286,28 +317,30 @@ def _stop_steps(self) -> None: else: # 
Technically we could just terminate, but what if # the application intercepts that and ignores it? - proc_group = self._group_infos[step_id].process_group - if ( - proc_group is not None - and proc_group.status == DRG_RUNNING_STATUS - ): - try: - proc_group.kill() - except DragonProcessGroupError: + with self._group_info_lock: + proc_group = self._group_infos[step_id].process_group + if ( + proc_group is not None + and proc_group.status == DRG_RUNNING_STATUS + ): try: - proc_group.stop() + proc_group.kill() except DragonProcessGroupError: - logger.error("Process group already stopped") - redir_group = self._group_infos[step_id].redir_workers - if redir_group is not None: - try: - redir_group.join(0.1) - del redir_group - except Exception as e: - logger.error(e) + try: + proc_group.stop() + except DragonProcessGroupError: + logger.error("Process group already stopped") + redir_group = self._group_infos[step_id].redir_workers + if redir_group is not None: + try: + redir_group.join(0.1) + del redir_group + except Exception as e: + logger.error(e) - self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED - self._group_infos[step_id].return_codes = [-9] + with self._group_info_lock: + self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED + self._group_infos[step_id].return_codes = [-9] def _start_steps(self) -> None: started = [] @@ -352,13 +385,14 @@ def _start_steps(self) -> None: puids = None try: puids = grp.puids - self._group_infos[step_id] = ProcessGroupInfo( - process_group=grp, - puids=puids, - return_codes=[], - status=SmartSimStatus.STATUS_RUNNING, - hosts=hosts, - ) + with self._group_info_lock: + self._group_infos[step_id] = ProcessGroupInfo( + process_group=grp, + puids=puids, + return_codes=[], + status=SmartSimStatus.STATUS_RUNNING, + hosts=hosts, + ) self._running_steps.append(step_id) started.append(step_id) except Exception as e: @@ -373,7 +407,8 @@ def _start_steps(self) -> None: request.output_file, request.error_file, ) - 
self._group_infos[step_id].redir_workers = redir_grp + with self._group_info_lock: + self._group_infos[step_id].redir_workers = redir_grp except Exception as e: logger.error(e) @@ -385,12 +420,12 @@ def _start_steps(self) -> None: self._queued_steps.pop(step_id) except KeyError: logger.error( - "Tried to allocate the same step twice, step id {step_id}" + f"Tried to allocate the same step twice, step id {step_id}" ) def _refresh_statuses(self) -> None: terminated = [] - with self._queue_lock: + with self._queue_lock, self._group_info_lock: for step_id in self._running_steps: group_info = self._group_infos[step_id] grp = group_info.process_group @@ -428,6 +463,7 @@ def _refresh_statuses(self) -> None: if terminated: logger.debug(f"{terminated=}") + for step_id in terminated: self._running_steps.remove(step_id) self._completed_steps.append(step_id) @@ -444,12 +480,13 @@ def _refresh_statuses(self) -> None: group_info.redir_workers = None def _update_shutdown_status(self) -> None: - self._can_shutdown = all( - grp_info.status in TERMINAL_STATUSES - and grp_info.process_group is None - and grp_info.redir_workers is None - for grp_info in self._group_infos.values() - ) + with self._group_info_lock: + self._can_shutdown |= all( + grp_info.status in TERMINAL_STATUSES + and grp_info.process_group is None + and grp_info.redir_workers is None + for grp_info in self._group_infos.values() + ) def _should_print_status(self) -> bool: if self._last_beat - self._last_update_time > 10: @@ -458,26 +495,58 @@ def _should_print_status(self) -> bool: return False def update(self) -> None: + """Update internal data structures, queues, and job statuses""" logger.debug("Dragon Backend update thread started") while not self.should_shutdown: try: self._heartbeat() self._stop_steps() + self._heartbeat() self._start_steps() + self._heartbeat() self._refresh_statuses() + self._heartbeat() self._update_shutdown_status() + time.sleep(0.1) except Exception as e: logger.error(e) if 
self._should_print_status(): try: + self._heartbeat() logger.debug(str(self)) except ValueError as e: logger.error(e) logger.debug("Dragon Backend update thread stopping") + @functools.singledispatchmethod + # Deliberately suppressing errors so that overloads have the same signature + # pylint: disable-next=no-self-use + def process_request(self, request: DragonRequest) -> DragonResponse: + """Process an incoming DragonRequest""" + raise TypeError(f"Unsure how to process a `{type(request)}` request") + @process_request.register - def _(self, request: DragonUpdateStatusRequest) -> DragonUpdateStatusResponse: + def _(self, request: DragonRunRequest) -> DragonRunResponse: + step_id = self._get_new_id() + honorable, err = self._can_honor(request) + if not honorable: + with self._group_info_lock: + self._group_infos[step_id] = ProcessGroupInfo( + status=SmartSimStatus.STATUS_FAILED, return_codes=[-1] + ) + return DragonRunResponse(step_id=step_id, error_message=err) + with self._queue_lock: + self._queued_steps[step_id] = request + with self._group_info_lock: + self._group_infos[step_id] = ProcessGroupInfo( + status=SmartSimStatus.STATUS_NEVER_STARTED + ) + return DragonRunResponse(step_id=step_id) + + @process_request.register + def _(self, request: DragonUpdateStatusRequest) -> DragonUpdateStatusResponse: + with self._group_info_lock: return DragonUpdateStatusResponse( statuses={ step_id: self._group_infos[step_id].smartsim_info diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 2a201c76e..2dabc2e1e 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -34,6 +34,7 @@ import signal import subprocess import sys +import time import typing as t from pathlib import Path from threading import RLock @@ -416,17 +417,9 @@ def _dragon_cleanup( if server_process_pid and psutil.pid_exists(server_process_pid): os.kill(server_process_pid, 
signal.SIGKILL) finally: + time.sleep(5) print("Sending shutdown request is complete") - try: - if server_authenticator is not None and server_authenticator.is_alive(): - print("Shutting down ZMQ authenticator") - server_authenticator.stop() - except Exception: - print("Authenticator shutdown error") - finally: - print("Authenticator shutdown is complete") - if server_process_pid: try: _, retcode = os.waitpid(server_process_pid, 0) @@ -437,6 +430,15 @@ def _dragon_cleanup( except Exception as e: logger.debug(e) + try: + if server_authenticator is not None and server_authenticator.is_alive(): + print("Shutting down ZMQ authenticator") + server_authenticator.stop() + except Exception: + print("Authenticator shutdown error") + finally: + print("Authenticator shutdown is complete") + def _resolve_dragon_path(fallback: t.Union[str, "os.PathLike[str]"]) -> Path: dragon_server_path = get_config().dragon_server_path or os.path.join( From 2ca85de2686438adee0a2fe96e96cc7e22dbb2ba Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 26 Apr 2024 13:31:58 +0000 Subject: [PATCH 038/101] Enforce authenticator infinite timeout --- smartsim/_core/launcher/dragon/dragonConnector.py | 4 ++-- smartsim/_core/launcher/step/step.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 2dabc2e1e..a4da87ddf 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -195,7 +195,7 @@ def _connect_to_existing_server(self, path: Path) -> None: ) try: self._reset_timeout() - self._get_new_authenticator() + self._get_new_authenticator(-1) self._handshake(dragon_conf["address"]) except SmartSimError as e: logger.error(e) @@ -209,7 +209,7 @@ def _start_connector_socket(self, socket_addr: str) -> zmq.Socket[t.Any]: config = get_config() connector_socket: t.Optional[zmq.Socket[t.Any]] = None 
self._reset_timeout(config.dragon_server_startup_timeout) - self._get_new_authenticator(config.dragon_server_startup_timeout) + self._get_new_authenticator(-1) connector_socket = dragonSockets.get_secure_socket(self._context, zmq.REP, True) logger.debug(f"Binding connector to {socket_addr}") connector_socket.bind(socket_addr) diff --git a/smartsim/_core/launcher/step/step.py b/smartsim/_core/launcher/step/step.py index 722330624..9d34fc52c 100644 --- a/smartsim/_core/launcher/step/step.py +++ b/smartsim/_core/launcher/step/step.py @@ -162,6 +162,8 @@ def _get_launch_cmd(self: _StepT) -> t.List[str]: entity_type = self.meta["entity_type"] status_dir = self.meta["status_dir"] + logger.debug("Encoding command "+" ".join(original_cmd_list)) + # encode the original cmd to avoid potential collisions and escaping # errors when passing it using CLI arguments to the indirect entrypoint encoded_cmd = encode_cmd(original_cmd_list) From 4b5bd94e6d311b64b149b319538131ca0a0fbe48 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 26 Apr 2024 16:23:52 +0000 Subject: [PATCH 039/101] Final comments being addressed --- smartsim/_core/entrypoints/dragon.py | 27 ++++++++++++------- .../_core/launcher/dragon/dragonBackend.py | 17 +++++++----- smartsim/_core/launcher/step/step.py | 2 +- smartsim/_core/schemas/dragonRequests.py | 8 +++--- smartsim/_core/utils/telemetry/telemetry.py | 8 ++++-- smartsim/_core/utils/telemetry/util.py | 2 +- tests/test_telemetry_monitor.py | 5 ++-- 7 files changed, 42 insertions(+), 27 deletions(-) diff --git a/smartsim/_core/entrypoints/dragon.py b/smartsim/_core/entrypoints/dragon.py index 0e26a07c3..c29ae4f6e 100644 --- a/smartsim/_core/entrypoints/dragon.py +++ b/smartsim/_core/entrypoints/dragon.py @@ -41,7 +41,11 @@ from smartsim._core.config import get_config from smartsim._core.launcher.dragon import dragonSockets from smartsim._core.launcher.dragon.dragonBackend import DragonBackend -from smartsim._core.schemas import DragonBootstrapRequest, 
DragonBootstrapResponse +from smartsim._core.schemas import ( + DragonBootstrapRequest, + DragonBootstrapResponse, + DragonShutdownRequest, +) from smartsim._core.utils.network import get_best_interface_and_address from smartsim.log import ContextThread, get_logger @@ -82,7 +86,7 @@ def print_summary(network_interface: str, ip_address: str) -> None: ) -def start_updater( +def restart_updater( backend: DragonBackend, updater: t.Optional[ContextThread] ) -> ContextThread: # If the updater was started, check if it completed or died @@ -107,12 +111,12 @@ def run( dragon_head_socket.bind(dragon_head_address) dragon_backend = DragonBackend(pid=dragon_pid) - backend_updater = start_updater(dragon_backend, None) + backend_updater = restart_updater(dragon_backend, None) server = dragonSockets.as_server(dragon_head_socket) logger.debug(f"Listening to {dragon_head_address}") - while not (dragon_backend.should_shutdown or SHUTDOWN_INITIATED): + while not dragon_backend.should_shutdown: try: req = server.recv() logger.debug(f"Received {type(req).__name__} {req}") @@ -127,20 +131,23 @@ def run( except zmq.Again: logger.error("Could not send response back to launcher.") - if not (dragon_backend.should_shutdown or SHUTDOWN_INITIATED): + # We can only check the heartbeat if the backend has not shut down + if not dragon_backend.should_shutdown: logger.debug(f"Listening to {dragon_head_address}") heartbeat_delay = ( dragon_backend.current_time - dragon_backend.last_heartbeat ) - if heartbeat_delay > 30.0: + if heartbeat_delay > 30.0 + float(dragon_backend.cooldown_period): logger.debug( f"Restarting updater after {heartbeat_delay:.2f} " "seconds of inactivity." 
) - backend_updater = start_updater(dragon_backend, backend_updater) - else: - logger.info("Backend shutdown has been requested") - break + backend_updater = restart_updater(dragon_backend, backend_updater) + + if SHUTDOWN_INITIATED: + dragon_backend.process_request(DragonShutdownRequest()) + + logger.info("Backend shutdown has been requested") if backend_updater.is_alive(): backend_updater.join(1) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 97b20b377..2a877df01 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -156,7 +156,9 @@ def __init__(self, pid: int) -> None: """The time at which the server initiated shutdown""" smartsim_config = get_config() self._cooldown_period = ( - smartsim_config.jm_interval * 2 if smartsim_config.telemetry_enabled else 5 + smartsim_config.telemetry_frequency * 2 + 5 + if smartsim_config.telemetry_enabled + else 5 ) """Time in seconds needed to server to complete shutdown""" logger.debug(f"{host_string} available for execution: {self._hosts}") @@ -188,6 +190,10 @@ def get_status_message(self) -> str: def _heartbeat(self) -> None: self._last_beat = self.current_time + @property + def cooldown_period(self) -> int: + return self._cooldown_period + @property def _has_cooled_down(self) -> bool: if self._shutdown_initiation_time is None: @@ -303,6 +309,7 @@ def _start_redirect_workers( return grp_redir def _stop_steps(self) -> None: + self._heartbeat() with self._queue_lock: while len(self._stop_requests) > 0: request = self._stop_requests.popleft() @@ -343,6 +350,7 @@ def _stop_steps(self) -> None: self._group_infos[step_id].return_codes = [-9] def _start_steps(self) -> None: + self._heartbeat() started = [] with self._queue_lock: for step_id, request in self._queued_steps.items(): @@ -424,6 +432,7 @@ def _start_steps(self) -> None: ) def _refresh_statuses(self) -> None: + self._heartbeat() terminated = 
[] with self._queue_lock, self._group_info_lock: for step_id in self._running_steps: @@ -480,6 +489,7 @@ def _refresh_statuses(self) -> None: group_info.redir_workers = None def _update_shutdown_status(self) -> None: + self._heartbeat() with self._group_info_lock: self._can_shutdown |= all( grp_info.status in TERMINAL_STATUSES @@ -499,20 +509,15 @@ def update(self) -> None: logger.debug("Dragon Backend update thread started") while not self.should_shutdown: try: - self._heartbeat() self._stop_steps() - self._heartbeat() self._start_steps() - self._heartbeat() self._refresh_statuses() - self._heartbeat() self._update_shutdown_status() time.sleep(0.1) except Exception as e: logger.error(e) if self._should_print_status(): try: - self._heartbeat() logger.debug(str(self)) except ValueError as e: logger.error(e) diff --git a/smartsim/_core/launcher/step/step.py b/smartsim/_core/launcher/step/step.py index 9d34fc52c..1cbc4d9c6 100644 --- a/smartsim/_core/launcher/step/step.py +++ b/smartsim/_core/launcher/step/step.py @@ -162,7 +162,7 @@ def _get_launch_cmd(self: _StepT) -> t.List[str]: entity_type = self.meta["entity_type"] status_dir = self.meta["status_dir"] - logger.debug("Encoding command "+" ".join(original_cmd_list)) + logger.debug(f"Encoding command{' '.join(original_cmd_list)}") # encode the original cmd to avoid potential collisions and escaping # errors when passing it using CLI arguments to the indirect entrypoint diff --git a/smartsim/_core/schemas/dragonRequests.py b/smartsim/_core/schemas/dragonRequests.py index 07f20bc64..3e384f746 100644 --- a/smartsim/_core/schemas/dragonRequests.py +++ b/smartsim/_core/schemas/dragonRequests.py @@ -83,10 +83,8 @@ class DragonBootstrapRequest(DragonRequest): @request_registry.register("shutdown") class DragonShutdownRequest(DragonRequest): - # Whether the server should shut down immediately - # setting this to False means that the server will - # shut down when all jobs are terminated. 
immediate: bool = True - # Whether the frontend will have to shut down - # or wait for external termination + """Whether the server should shut down immediately, setting this to False means + that the server will shut down when all jobs are terminated.""" frontend_shutdown: bool = True + """Whether the frontend will have to shut down or wait for external termination""" diff --git a/smartsim/_core/utils/telemetry/telemetry.py b/smartsim/_core/utils/telemetry/telemetry.py index 595fef335..fbf1fc5a0 100644 --- a/smartsim/_core/utils/telemetry/telemetry.py +++ b/smartsim/_core/utils/telemetry/telemetry.py @@ -119,7 +119,7 @@ def tracked_jobs(self) -> t.Sequence[JobEntity]: def init_launcher(self, launcher: str) -> None: """Initialize the controller with a specific type of launcher. - SmartSim currently supports slurm, pbs(pro), lsf, + SmartSim currently supports Slurm, PBS(Pro), LSF, Dragon and local launching :param launcher: the name of the workload manager used by the experiment @@ -145,7 +145,11 @@ def init_job_manager(self) -> None: self.job_manager.start() def set_launcher(self, launcher_type: str) -> None: - """Set the launcher for the experiment""" + """Set the launcher for the experiment + + :param launcher_type: name of launcher type, e.g. 
'slurm' + :type launcher_type: str + """ self.init_launcher(launcher_type) if self._launcher is None: diff --git a/smartsim/_core/utils/telemetry/util.py b/smartsim/_core/utils/telemetry/util.py index cf6459cf9..03f0b495e 100644 --- a/smartsim/_core/utils/telemetry/util.py +++ b/smartsim/_core/utils/telemetry/util.py @@ -69,7 +69,7 @@ def write_event( :param return_code: (optional) the return code of a completed task :type return_code: int|None""" - tgt_path = pathlib.Path(status_dir) / f"{event_type}.json" + tgt_path = status_dir / f"{event_type}.json" tgt_path.parent.mkdir(parents=True, exist_ok=True) try: diff --git a/tests/test_telemetry_monitor.py b/tests/test_telemetry_monitor.py index 8c895cecf..693240f4f 100644 --- a/tests/test_telemetry_monitor.py +++ b/tests/test_telemetry_monitor.py @@ -84,7 +84,7 @@ def turn_on_tm(monkeypatch): yield -def write_stop_file(entity: JobEntity, test_dir: str, duration: int): +def write_stop_file(entity: JobEntity, test_dir: pathlib.Path, duration: int): time.sleep(duration) write_event( get_ts_ms(), @@ -583,7 +583,8 @@ def is_alive(self) -> bool: entity.status_dir = test_dir p = mp.Process( - target=write_stop_file, args=(entity, test_dir, (task_duration_ms / 1000)) + target=write_stop_file, + args=(entity, pathlib.Path(test_dir), (task_duration_ms / 1000)), ) frequency = 1000 From 81e1e1b308a0b36ee26d05a032311c39446b12e9 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 26 Apr 2024 12:13:51 -0700 Subject: [PATCH 040/101] Fix args in dragon entrypoint --- smartsim/_core/entrypoints/dragon.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/entrypoints/dragon.py b/smartsim/_core/entrypoints/dragon.py index 46dcca2ab..095582054 100644 --- a/smartsim/_core/entrypoints/dragon.py +++ b/smartsim/_core/entrypoints/dragon.py @@ -310,4 +310,4 @@ def main(args_: t.List[str]) -> int: if __name__ == "__main__": - sys.exit(main(sys.argv)) + sys.exit(main(sys.argv[1:])) From 
2c8a208c3c9013b8f2b6c8488776f300dbda8120 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Sat, 27 Apr 2024 13:58:16 +0000 Subject: [PATCH 041/101] Fix handshake test post-merge --- tests/test_dragon_launcher.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/test_dragon_launcher.py b/tests/test_dragon_launcher.py index 96ae6768b..ae741c472 100644 --- a/tests/test_dragon_launcher.py +++ b/tests/test_dragon_launcher.py @@ -130,13 +130,16 @@ def is_alive(self) -> bool: def mock_dragon_env(test_dir, *args, **kwargs): """Create a mock dragon environment that can talk to the launcher through ZMQ""" logger = logging.getLogger(__name__) + config = get_config() logging.basicConfig(level=logging.DEBUG) try: addr = "127.0.0.1" callback_port = kwargs["port"] head_port = find_free_port(start=callback_port + 1) context = zmq.Context.instance() - authenticator = get_authenticator(context) + context.setsockopt(zmq.SNDTIMEO, config.dragon_server_timeout) + context.setsockopt(zmq.RCVTIMEO, config.dragon_server_timeout) + authenticator = get_authenticator(context, -1) callback_socket = get_secure_socket(context, zmq.REQ, False) dragon_head_socket = get_secure_socket(context, zmq.REP, True) @@ -187,6 +190,7 @@ def mock_dragon_env(test_dir, *args, **kwargs): except Exception as ex: logger.info(f"exception occurred while configuring mock handshaker: {ex}") + raise ex from None def test_dragon_connect_bind_address(monkeypatch: pytest.MonkeyPatch, test_dir: str): @@ -349,6 +353,11 @@ def test_dragon_launcher_handshake(monkeypatch: pytest.MonkeyPatch, test_dir: st lambda: IFConfig("faux_interface", addr), ) + ctx.setattr( + "smartsim._core.launcher.dragon.dragonConnector._dragon_cleanup", + lambda server_socket, server_process_pid, server_authenticator: server_authenticator.stop(), + ) + # start up a faux dragon env that knows how to do the handshake process # but uses secure sockets for all communication. 
mock_dragon = mp.Process( @@ -369,5 +378,5 @@ def fn(*args, **kwargs): # connect executes the complete handshake and raises an exception if comms fails connector.connect_to_dragon() finally: - connector.cleanup(False) + connector.cleanup() ... From a5d39dbee5805db6db8085e2d29b8076cb6c98ee Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Sun, 28 Apr 2024 14:20:59 +0000 Subject: [PATCH 042/101] Fix issue with inactive PUIDS, create tables --- conftest.py | 8 +- smartsim/_core/entrypoints/dragon.py | 29 ++- .../_core/launcher/dragon/dragonBackend.py | 216 +++++++++++------- .../_core/launcher/dragon/dragonConnector.py | 17 +- .../_core/launcher/dragon/dragonSockets.py | 2 +- smartsim/_core/schemas/utils.py | 4 +- 6 files changed, 173 insertions(+), 103 deletions(-) diff --git a/conftest.py b/conftest.py index b5d6a39da..55753634d 100644 --- a/conftest.py +++ b/conftest.py @@ -166,9 +166,9 @@ def pytest_sessionfinish( raise else: break - - # kill all spawned processes - kill_all_test_spawned_processes() + else: + # kill all spawned processes + kill_all_test_spawned_processes() def build_mpi_app() -> t.Optional[pathlib.Path]: @@ -773,7 +773,7 @@ def setup_test_colo( db_args["port"] = port db_args["ifname"] = "lo" if db_type == "uds" and colo_model_name is not None: - tmp_dir = tempfile.gettempdir() + tmp_dir = "/var/tmp" socket_suffix = str(uuid.uuid4())[:7] socket_name = f"{colo_model_name}_{socket_suffix}.socket" db_args["unix_socket"] = os.path.join(tmp_dir, socket_name) diff --git a/smartsim/_core/entrypoints/dragon.py b/smartsim/_core/entrypoints/dragon.py index 095582054..9441bb21b 100644 --- a/smartsim/_core/entrypoints/dragon.py +++ b/smartsim/_core/entrypoints/dragon.py @@ -113,6 +113,23 @@ def restart_updater( return updater +def is_updater_healthy(backend: DragonBackend) -> bool: + + heartbeat_delay = backend.current_time - backend.last_heartbeat + if heartbeat_delay > 30.0 + float(backend.cooldown_period): + logger.debug( + f"Updater inactive for 
{heartbeat_delay:.2f} seconds, will request restart." + ) + return False + return True + + +def updater_fallback(backend: DragonBackend, updater: ContextThread) -> ContextThread: + if is_updater_healthy(backend): + return updater + return restart_updater(backend, updater) + + # pylint: disable-next=too-many-statements def run( zmq_context: "zmq.Context[t.Any]", @@ -134,6 +151,7 @@ def run( req = server.recv() logger.debug(f"Received {type(req).__name__} {req}") except zmq.Again: + backend_updater = updater_fallback(dragon_backend, backend_updater) continue resp = dragon_backend.process_request(req) @@ -143,19 +161,12 @@ def run( server.send(resp) except zmq.Again: logger.error("Could not send response back to launcher.") + backend_updater = updater_fallback(dragon_backend, backend_updater) # We can only check the heartbeat if the backend has not shut down if not dragon_backend.should_shutdown: logger.debug(f"Listening to {dragon_head_address}") - heartbeat_delay = ( - dragon_backend.current_time - dragon_backend.last_heartbeat - ) - if heartbeat_delay > 30.0 + float(dragon_backend.cooldown_period): - logger.debug( - f"Restarting updater after {heartbeat_delay:.2f} " - "seconds of inactivity." 
- ) - backend_updater = restart_updater(dragon_backend, backend_updater) + backend_updater = updater_fallback(dragon_backend, backend_updater) if SHUTDOWN_INITIATED: dragon_backend.process_request(DragonShutdownRequest()) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 2a877df01..f07f27bd3 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -28,6 +28,8 @@ import time import typing as t from dataclasses import dataclass, field + +from tabulate import tabulate from threading import RLock # pylint: disable=import-error @@ -88,6 +90,21 @@ def smartsim_info(self) -> t.Tuple[SmartSimStatus, t.Optional[t.List[int]]]: """Information needed by SmartSim Launcher and Job Manager""" return (self.status, self.return_codes) + def __str__(self) -> str: + if self.process_group is not None and self.redir_workers is not None: + msg = [f"Active Group ({self.status})"] + if self.puids is not None: + msg.append(f"Number processes: {len(self.puids)}") + else: + msg = [f"Inactive Group ({self.status})"] + + if self.hosts is not None: + msg.append(f"Hosts: {','.join(self.hosts)}") + if self.return_codes is not None: + msg.append(f"{self.return_codes}") + + return ", ".join(msg) + # Thanks to Colin Wahl from HPE HPC Dragon Team def redir_worker(io_conn: Connection, file_path: str) -> None: @@ -98,6 +115,8 @@ def redir_worker(io_conn: Connection, file_path: str) -> None: :param file_path: path to file to write to :type file_path: str """ + while io_conn is None or not io_conn.readable: + time.sleep(0.1) try: with open(file_path, "a", encoding="utf-8") as file_to_write: while True: @@ -105,6 +124,8 @@ def redir_worker(io_conn: Connection, file_path: str) -> None: print(output, flush=True, file=file_to_write, end="") except EOFError: pass + except Exception as e: + print(e) finally: io_conn.close() @@ -123,8 +144,6 @@ def __init__(self, pid: int) -> None: 
"""ProcessGroup execution state information""" self._queue_lock = RLock() """Lock that needs to be acquired to access internal queues""" - self._group_info_lock = RLock() - """Lock that needs to be acquired to access _group_infos""" self._step_id: int = 0 """Incremental ID to assign to new steps prior to execution""" @@ -163,6 +182,49 @@ def __init__(self, pid: int) -> None: """Time in seconds needed to server to complete shutdown""" logger.debug(f"{host_string} available for execution: {self._hosts}") + @staticmethod + def _proc_group_info_table_line(step_id: str, proc_group_info: ProcessGroupInfo) -> t.List[str]: + table_line = [step_id, f"{str(proc_group_info.status)}"] + + if proc_group_info.hosts is not None: + table_line.append(f"{','.join(proc_group_info.hosts)}") + else: + table_line.append("") + + if proc_group_info.return_codes is not None: + table_line.append(f"{','.join(str(ret) for ret in proc_group_info.return_codes)}") + else: + table_line.append("") + + if proc_group_info.puids is not None: + table_line.append(f"{len(proc_group_info.puids)}") + else: + table_line.append("") + + return table_line + + @property + def step_table(self) -> str: + headers = ["Step", "Status", "Hosts", "Return codes", "PUIDS"] + values = [] + + with self._queue_lock: + for step, group_info in self._group_infos.items(): + values.append(DragonBackend._proc_group_info_table_line(step, group_info)) + + return tabulate(values, headers, disable_numparse=True, tablefmt="github") + + @property + def host_table(self) -> str: + headers = ["Host", "State"] + values = [] + + with self._queue_lock: + for host in self._hosts: + values.append([host, "Free" if host in self._free_hosts else "Busy"]) + + return tabulate(values, headers, disable_numparse=True, tablefmt="githbu") + def _initialize_hosts(self) -> None: with self._queue_lock: self._hosts: t.List[str] = sorted( @@ -179,12 +241,8 @@ def __str__(self) -> str: def get_status_message(self) -> str: msg = ["Dragon server backend 
update"] - msg.append(f"System hosts: {self._hosts}") - msg.append(f"Free hosts: {list(self._free_hosts)}") - msg.append(f"Allocated hosts: {self._allocated_hosts}") - msg.append(f"Running steps: {self._running_steps}") - msg.append(f"Group infos: {self._group_infos}") - msg.append(f"There are {len(self._queued_steps)} queued steps") + msg.append(self.host_table) + msg.append(self.step_table) return "\n".join(msg) def _heartbeat(self) -> None: @@ -197,6 +255,7 @@ def cooldown_period(self) -> int: @property def _has_cooled_down(self) -> bool: if self._shutdown_initiation_time is None: + logger.debug(f"Starting cooldown period of {self._cooldown_period} seconds") self._shutdown_initiation_time = self.current_time return ( self.current_time - self._shutdown_initiation_time > self._cooldown_period @@ -267,14 +326,14 @@ def _get_new_id(self) -> str: return step_id @staticmethod - def _start_redirect_workers( + def _create_redirect_workers( global_policy: Policy, policies: t.List[Policy], puids: t.List[int], out_file: t.Optional[str], err_file: t.Optional[str], ) -> ProcessGroup: - grp_redir = ProcessGroup(restart=False, policy=global_policy) + grp_redir = ProcessGroup(restart=False, policy=global_policy, pmi_enabled=False) for pol, puid in zip(policies, puids): proc = Process(None, ident=puid) if out_file: @@ -297,14 +356,6 @@ def _start_redirect_workers( policy=pol, ), ) - try: - grp_redir.init() - time.sleep(0.1) - grp_redir.start() - except Exception as e: - raise IOError( - f"Could not redirect stdout and stderr for PUIDS {puids}" - ) from e return grp_redir @@ -324,35 +375,33 @@ def _stop_steps(self) -> None: else: # Technically we could just terminate, but what if # the application intercepts that and ignores it? 
- with self._group_info_lock: - proc_group = self._group_infos[step_id].process_group - if ( - proc_group is not None - and proc_group.status == DRG_RUNNING_STATUS - ): + proc_group = self._group_infos[step_id].process_group + if ( + proc_group is not None + and proc_group.status == DRG_RUNNING_STATUS + ): + try: + proc_group.kill() + except DragonProcessGroupError: try: - proc_group.kill() + proc_group.stop() except DragonProcessGroupError: - try: - proc_group.stop() - except DragonProcessGroupError: - logger.error("Process group already stopped") - redir_group = self._group_infos[step_id].redir_workers - if redir_group is not None: - try: - redir_group.join(0.1) - del redir_group - except Exception as e: - logger.error(e) + logger.error("Process group already stopped") + redir_group = self._group_infos[step_id].redir_workers + if redir_group is not None: + try: + redir_group.join(0.1) + redir_group = None + except Exception as e: + logger.error(e) - with self._group_info_lock: - self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED - self._group_infos[step_id].return_codes = [-9] + self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED + self._group_infos[step_id].return_codes = [-9] def _start_steps(self) -> None: self._heartbeat() - started = [] with self._queue_lock: + started = [] for step_id, request in self._queued_steps.items(): hosts = self._allocate_step(step_id, self._queued_steps[step_id]) if not hosts: @@ -387,38 +436,48 @@ def _start_steps(self) -> None: try: grp.init() grp.start() + grp_status = SmartSimStatus.STATUS_RUNNING except Exception as e: logger.error(e) + grp_status = SmartSimStatus.STATUS_FAILED puids = None try: - puids = grp.puids - with self._group_info_lock: - self._group_infos[step_id] = ProcessGroupInfo( - process_group=grp, - puids=puids, - return_codes=[], - status=SmartSimStatus.STATUS_RUNNING, - hosts=hosts, - ) + puids = list( + set(grp.puids + [puid for puid, retcode in grp.inactive_puids]) + ) + 
self._group_infos[step_id] = ProcessGroupInfo( + process_group=grp, + puids=puids, + return_codes=[], + status=grp_status, + hosts=hosts, + ) self._running_steps.append(step_id) started.append(step_id) except Exception as e: logger.error(e) - if puids is not None: + if ( + puids is not None + and len(puids) == len(policies) + and grp_status == SmartSimStatus.STATUS_RUNNING + ): + redir_grp = DragonBackend._create_redirect_workers( + global_policy, + policies, + puids, + request.output_file, + request.error_file, + ) try: - redir_grp = DragonBackend._start_redirect_workers( - global_policy, - policies, - puids, - request.output_file, - request.error_file, - ) - with self._group_info_lock: - self._group_infos[step_id].redir_workers = redir_grp + redir_grp.init() + redir_grp.start() except Exception as e: - logger.error(e) + raise IOError( + f"Could not redirect stdout and stderr for PUIDS {puids}" + ) from e + self._group_infos[step_id].redir_workers = redir_grp if started: logger.debug(f"{started=}") @@ -430,11 +489,13 @@ def _start_steps(self) -> None: logger.error( f"Tried to allocate the same step twice, step id {step_id}" ) + except Exception as e: + logger.error(e) def _refresh_statuses(self) -> None: self._heartbeat() - terminated = [] - with self._queue_lock, self._group_info_lock: + with self._queue_lock: + terminated = [] for step_id in self._running_steps: group_info = self._group_infos[step_id] grp = group_info.process_group @@ -483,14 +544,14 @@ def _refresh_statuses(self) -> None: try: self._allocated_hosts.pop(host) except KeyError: - logger.error(f"Tried to free same host twice: {host}") + logger.error(f"Tried to free a non-allocated host: {host}") self._free_hosts.append(host) group_info.process_group = None group_info.redir_workers = None def _update_shutdown_status(self) -> None: self._heartbeat() - with self._group_info_lock: + with self._queue_lock: self._can_shutdown |= all( grp_info.status in TERMINAL_STATUSES and grp_info.process_group is None 
@@ -499,8 +560,8 @@ def _update_shutdown_status(self) -> None: ) def _should_print_status(self) -> bool: - if self._last_beat - self._last_update_time > 10: - self._last_update_time = self._last_beat + if self.current_time - self._last_update_time > 10: + self._last_update_time = self.current_time return True return False @@ -533,25 +594,22 @@ def process_request(self, request: DragonRequest) -> DragonResponse: @process_request.register def _(self, request: DragonRunRequest) -> DragonRunResponse: step_id = self._get_new_id() - honorable, err = self._can_honor(request) - if not honorable: - with self._group_info_lock: + with self._queue_lock: + honorable, err = self._can_honor(request) + if not honorable: self._group_infos[step_id] = ProcessGroupInfo( status=SmartSimStatus.STATUS_FAILED, return_codes=[-1] ) - return DragonRunResponse(step_id=step_id, error_message=err) - - with self._queue_lock: - self._queued_steps[step_id] = request - with self._group_info_lock: - self._group_infos[step_id] = ProcessGroupInfo( - status=SmartSimStatus.STATUS_NEVER_STARTED - ) - return DragonRunResponse(step_id=step_id) + else: + self._queued_steps[step_id] = request + self._group_infos[step_id] = ProcessGroupInfo( + status=SmartSimStatus.STATUS_NEVER_STARTED + ) + return DragonRunResponse(step_id=step_id, error_message=err) @process_request.register def _(self, request: DragonUpdateStatusRequest) -> DragonUpdateStatusResponse: - with self._group_info_lock: + with self._queue_lock: return DragonUpdateStatusResponse( statuses={ step_id: self._group_infos[step_id].smartsim_info diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index a4da87ddf..b1fa0d469 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -31,7 +31,6 @@ import itertools import json import os -import signal import subprocess import sys import time @@ -372,13 +371,16 @@ def 
_parse_launched_dragon_server_info_from_files( @staticmethod def _send_req_with_socket( - socket: zmq.Socket[t.Any], request: DragonRequest, flags: int = 0 + socket: zmq.Socket[t.Any], + request: DragonRequest, + send_flags: int = 0, + recv_flags: int = 0, ) -> DragonResponse: client = dragonSockets.as_client(socket) with DRG_LOCK: logger.debug(f"Sending {type(request).__name__}: {request}") - client.send(request, flags) - response = client.recv() + client.send(request, send_flags) + response = client.recv(flags=recv_flags) logger.debug(f"Received {type(response).__name__}: {response}") return response @@ -408,19 +410,18 @@ def _dragon_cleanup( print("Sending shutdown request to dragon environment") # pylint: disable-next=protected-access DragonConnector._send_req_with_socket( - server_socket, DragonShutdownRequest() + server_socket, DragonShutdownRequest(), recv_flags=zmq.NOBLOCK ) except (zmq.error.ZMQError, zmq.Again) as e: # Can't use the logger as I/O file may be closed print("Could not send shutdown request to dragon server") print(f"ZMQ error: {e}", flush=True) - if server_process_pid and psutil.pid_exists(server_process_pid): - os.kill(server_process_pid, signal.SIGKILL) + finally: time.sleep(5) print("Sending shutdown request is complete") - if server_process_pid: + if server_process_pid and psutil.pid_exists(server_process_pid): try: _, retcode = os.waitpid(server_process_pid, 0) print( diff --git a/smartsim/_core/launcher/dragon/dragonSockets.py b/smartsim/_core/launcher/dragon/dragonSockets.py index ab4ecd00c..ee6e644a0 100644 --- a/smartsim/_core/launcher/dragon/dragonSockets.py +++ b/smartsim/_core/launcher/dragon/dragonSockets.py @@ -139,7 +139,7 @@ def get_authenticator( server_keys, client_keys = key_manager.get_keys() logger.debug(f"Applying keys to authenticator: {server_keys}, {client_keys}") - AUTHENTICATOR = zmq.auth.thread.ThreadAuthenticator(context) + AUTHENTICATOR = zmq.auth.thread.ThreadAuthenticator(context, log=logger) ctx_sndtimeo = 
context.getsockopt(zmq.SNDTIMEO) ctx_rcvtimeo = context.getsockopt(zmq.RCVTIMEO) diff --git a/smartsim/_core/schemas/utils.py b/smartsim/_core/schemas/utils.py index 838a9b0db..9cb36bcf5 100644 --- a/smartsim/_core/schemas/utils.py +++ b/smartsim/_core/schemas/utils.py @@ -120,5 +120,5 @@ class SocketSchemaTranslator(t.Generic[_SendT, _RecvT]): def send(self, schema: _SendT, flags: int = 0) -> None: self.socket.send_string(self._send_registry.to_string(schema), flags) - def recv(self) -> _RecvT: - return self._recv_registry.from_string(self.socket.recv_string()) + def recv(self, flags: int = 0) -> _RecvT: + return self._recv_registry.from_string(self.socket.recv_string(flags)) From 9d015c14ab6a74f728e187e7e1cb1a72b61e180e Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Sun, 28 Apr 2024 15:02:03 +0000 Subject: [PATCH 043/101] Lint --- smartsim/_core/launcher/dragon/dragonBackend.py | 14 ++++++++++---- tests/test_dragon_launcher.py | 4 ++-- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index f07f27bd3..5f3c57484 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -28,9 +28,9 @@ import time import typing as t from dataclasses import dataclass, field +from threading import RLock from tabulate import tabulate -from threading import RLock # pylint: disable=import-error # isort: off @@ -183,7 +183,9 @@ def __init__(self, pid: int) -> None: logger.debug(f"{host_string} available for execution: {self._hosts}") @staticmethod - def _proc_group_info_table_line(step_id: str, proc_group_info: ProcessGroupInfo) -> t.List[str]: + def _proc_group_info_table_line( + step_id: str, proc_group_info: ProcessGroupInfo + ) -> t.List[str]: table_line = [step_id, f"{str(proc_group_info.status)}"] if proc_group_info.hosts is not None: @@ -192,7 +194,9 @@ def _proc_group_info_table_line(step_id: str, 
proc_group_info: ProcessGroupInfo) table_line.append("") if proc_group_info.return_codes is not None: - table_line.append(f"{','.join(str(ret) for ret in proc_group_info.return_codes)}") + table_line.append( + f"{','.join(str(ret) for ret in proc_group_info.return_codes)}" + ) else: table_line.append("") @@ -210,7 +214,9 @@ def step_table(self) -> str: with self._queue_lock: for step, group_info in self._group_infos.items(): - values.append(DragonBackend._proc_group_info_table_line(step, group_info)) + values.append( + DragonBackend._proc_group_info_table_line(step, group_info) + ) return tabulate(values, headers, disable_numparse=True, tablefmt="github") diff --git a/tests/test_dragon_launcher.py b/tests/test_dragon_launcher.py index ae741c472..eb3e0a250 100644 --- a/tests/test_dragon_launcher.py +++ b/tests/test_dragon_launcher.py @@ -85,7 +85,7 @@ def __call__(self, *args: t.Any, **kwds: t.Any) -> t.Any: def bind(self, addr: str) -> None: self._bind_address = addr - def recv_string(self) -> str: + def recv_string(self, flags: int) -> str: dbr = DragonBootstrapRequest(address=self._bind_address) return f"bootstrap|{dbr.json()}" @@ -105,7 +105,7 @@ def bind_address(self) -> str: class MockAuthenticator: - def __init__(self, context: zmq.Context) -> None: + def __init__(self, context: zmq.Context, log: t.Any) -> None: self.num_starts: int = 0 self.num_stops: int = 0 self.num_configure_curves: int = 0 From 68f4d86288c130aaeb5c61996386ae752907f795 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Sun, 28 Apr 2024 17:15:52 +0000 Subject: [PATCH 044/101] Revert tmp_dir --- conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conftest.py b/conftest.py index 55753634d..5cba30b2e 100644 --- a/conftest.py +++ b/conftest.py @@ -773,7 +773,7 @@ def setup_test_colo( db_args["port"] = port db_args["ifname"] = "lo" if db_type == "uds" and colo_model_name is not None: - tmp_dir = "/var/tmp" + tmp_dir = tempfile.gettempdir() socket_suffix = 
str(uuid.uuid4())[:7] socket_name = f"{colo_model_name}_{socket_suffix}.socket" db_args["unix_socket"] = os.path.join(tmp_dir, socket_name) From 770e11c45b043e8ca77b4aee16bbc21edc120c54 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Sun, 28 Apr 2024 22:36:12 +0000 Subject: [PATCH 045/101] Fix table formatting --- smartsim/_core/launcher/dragon/dragonBackend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 5f3c57484..ca28808f0 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -229,7 +229,7 @@ def host_table(self) -> str: for host in self._hosts: values.append([host, "Free" if host in self._free_hosts else "Busy"]) - return tabulate(values, headers, disable_numparse=True, tablefmt="githbu") + return tabulate(values, headers, disable_numparse=True, tablefmt="github") def _initialize_hosts(self) -> None: with self._queue_lock: From 1823c2c2cce4f007be9622ee497a3e86c391a098 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 30 Apr 2024 21:20:14 +0200 Subject: [PATCH 046/101] Add dragon backend unit tests, refactor dragon backend --- .../_core/launcher/dragon/dragonBackend.py | 92 +++-- tests/test_dragon_backend.py | 344 ++++++++++++++++++ 2 files changed, 397 insertions(+), 39 deletions(-) create mode 100644 tests/test_dragon_backend.py diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index ca28808f0..4ea098559 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -34,14 +34,20 @@ # pylint: disable=import-error # isort: off -from dragon.infrastructure.connection import Connection -from dragon.infrastructure.policy import Policy -from dragon.native.process import Process, ProcessTemplate, Popen -from dragon.native.process_group import ( - ProcessGroup, - 
DragonProcessGroupError, -) -from dragon.native.machine import System, Node +import dragon.infrastructure.connection as dragon_connection +import dragon.infrastructure.policy as dragon_policy +import dragon.native.process as dragon_process +import dragon.native.process_group as dragon_process_group +import dragon.native.machine as dragon_machine + +# from dragon.infrastructure.connection import Connection +# from dragon.infrastructure.policy import Policy +# from dragon.native.process import Process, ProcessTemplate, Popen +# from dragon.native.process_group import ( +# ProcessGroup, +# DragonProcessGroupError, +# ) +# from dragon.native.machine import System, Node # pylint: enable=import-error # isort: on @@ -74,7 +80,7 @@ class ProcessGroupInfo: status: SmartSimStatus """Status of step""" - process_group: t.Optional[ProcessGroup] = None + process_group: t.Optional[dragon_process_group.ProcessGroup] = None """Internal Process Group object, None for finished or not started steps""" puids: t.Optional[t.List[t.Optional[int]]] = None # puids can be None """List of Process UIDS belonging to the ProcessGroup""" @@ -82,7 +88,7 @@ class ProcessGroupInfo: """List of return codes of completed processes""" hosts: t.List[str] = field(default_factory=list) """List of hosts on which the Process Group """ - redir_workers: t.Optional[ProcessGroup] = None + redir_workers: t.Optional[dragon_process_group.ProcessGroup] = None """Workers used to redirect stdout and stderr to file""" @property @@ -107,11 +113,11 @@ def __str__(self) -> str: # Thanks to Colin Wahl from HPE HPC Dragon Team -def redir_worker(io_conn: Connection, file_path: str) -> None: +def redir_worker(io_conn: dragon_connection.Connection, file_path: str) -> None: """Read stdout/stderr from the Dragon connection. 
:param io_conn: Dragon connection to stdout or stderr - :type io_conn: Connection + :type io_conn: dragon.infrastructure.connection.Connection :param file_path: path to file to write to :type file_path: str """ @@ -164,7 +170,7 @@ def __init__(self, pid: int) -> None: self._last_update_time = self._last_beat """Time at which the status update was printed the last time""" num_hosts = len(self._hosts) - host_string = str(num_hosts) + (" hosts" if num_hosts > 1 else " host") + host_string = str(num_hosts) + (" hosts" if num_hosts != 1 else " host") self._shutdown_requested = False """Whether the shutdown was requested to this server""" self._can_shutdown = False @@ -234,7 +240,8 @@ def host_table(self) -> str: def _initialize_hosts(self) -> None: with self._queue_lock: self._hosts: t.List[str] = sorted( - Node(node).hostname for node in System().nodes + dragon_machine.Node(node).hostname + for node in dragon_machine.System().nodes ) """List of hosts available in allocation""" self._free_hosts: t.Deque[str] = collections.deque(self._hosts) @@ -282,7 +289,7 @@ def last_heartbeat(self) -> float: @property def should_shutdown(self) -> bool: - """ "Whether the server should shut down + """Whether the server should shut down A server should shut down if a DragonShutdownRequest was received and it requested immediate shutdown, or if it did not request immediate @@ -333,32 +340,34 @@ def _get_new_id(self) -> str: @staticmethod def _create_redirect_workers( - global_policy: Policy, - policies: t.List[Policy], + global_policy: dragon_policy.Policy, + policies: t.List[dragon_policy.Policy], puids: t.List[int], out_file: t.Optional[str], err_file: t.Optional[str], - ) -> ProcessGroup: - grp_redir = ProcessGroup(restart=False, policy=global_policy, pmi_enabled=False) + ) -> dragon_process_group.ProcessGroup: + grp_redir = dragon_process_group.ProcessGroup( + restart=False, policy=global_policy, pmi_enabled=False + ) for pol, puid in zip(policies, puids): - proc = Process(None, 
ident=puid) + proc = dragon_process.Process(None, ident=puid) if out_file: grp_redir.add_process( nproc=1, - template=ProcessTemplate( + template=dragon_process.ProcessTemplate( target=redir_worker, args=(proc.stdout_conn, out_file), - stdout=Popen.DEVNULL, + stdout=dragon_process.Popen.DEVNULL, policy=pol, ), ) if err_file: grp_redir.add_process( nproc=1, - template=ProcessTemplate( + template=dragon_process.ProcessTemplate( target=redir_worker, args=(proc.stderr_conn, err_file), - stdout=Popen.DEVNULL, + stdout=dragon_process.Popen.DEVNULL, policy=pol, ), ) @@ -388,10 +397,10 @@ def _stop_steps(self) -> None: ): try: proc_group.kill() - except DragonProcessGroupError: + except dragon_process_group.DragonProcessGroupError: try: proc_group.stop() - except DragonProcessGroupError: + except dragon_process_group.DragonProcessGroupError: logger.error("Process group already stopped") redir_group = self._group_infos[step_id].redir_workers if redir_group is not None: @@ -415,26 +424,28 @@ def _start_steps(self) -> None: logger.debug(f"Step id {step_id} allocated on {hosts}") - global_policy = Policy( - placement=Policy.Placement.HOST_NAME, host_name=hosts[0] + global_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=hosts[0], ) - grp = ProcessGroup( + grp = dragon_process_group.ProcessGroup( restart=False, pmi_enabled=request.pmi_enabled, policy=global_policy ) policies = [] for node_name in hosts: - local_policy = Policy( - placement=Policy.Placement.HOST_NAME, host_name=node_name + local_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=node_name, ) policies.extend([local_policy] * request.tasks_per_node) - tmp_proc = ProcessTemplate( + tmp_proc = dragon_process.ProcessTemplate( target=request.exe, args=request.exe_args, cwd=request.path, env={**request.current_env, **request.env}, - stdout=Popen.PIPE, - stderr=Popen.PIPE, + stdout=dragon_process.Popen.PIPE, + 
stderr=dragon_process.Popen.PIPE, policy=local_policy, ) grp.add_process(nproc=request.tasks_per_node, template=tmp_proc) @@ -518,7 +529,7 @@ def _refresh_statuses(self) -> None: ): try: group_info.return_codes = [ - Process(None, ident=puid).returncode + dragon_process.Process(None, ident=puid).returncode for puid in puids ] except (ValueError, TypeError) as e: @@ -571,15 +582,18 @@ def _should_print_status(self) -> bool: return True return False + def _update(self) -> None: + self._stop_steps() + self._start_steps() + self._refresh_statuses() + self._update_shutdown_status() + def update(self) -> None: """Update internal data structures, queues, and job statuses""" logger.debug("Dragon Backend update thread started") while not self.should_shutdown: try: - self._stop_steps() - self._start_steps() - self._refresh_statuses() - self._update_shutdown_status() + self._update() time.sleep(0.1) except Exception as e: logger.error(e) diff --git a/tests/test_dragon_backend.py b/tests/test_dragon_backend.py new file mode 100644 index 000000000..cd6e85925 --- /dev/null +++ b/tests/test_dragon_backend.py @@ -0,0 +1,344 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import collections +import sys +import time +from unittest.mock import MagicMock + +import pytest + +from smartsim._core.config import CONFIG +from smartsim._core.schemas.dragonRequests import * +from smartsim._core.schemas.dragonResponses import * +from smartsim._core.utils.helpers import create_short_id_str + +if t.TYPE_CHECKING: + from smartsim._core.launcher.dragon.dragonBackend import ( + DragonBackend, + ProcessGroupInfo, + ) + + +class NodeMock(MagicMock): + @property + def hostname(self) -> str: + return create_short_id_str() + + +def get_mock_backend(monkeypatch: pytest.MonkeyPatch) -> "DragonBackend": + + process_mock = MagicMock(returncode=0) + process_module_mock = MagicMock() + process_module_mock.Process = process_mock + node_mock = NodeMock() + system_mock = MagicMock(nodes=["node1", "node2", "node3"]) + monkeypatch.setitem( + sys.modules, + "dragon", + MagicMock( + **{ + "native.machine.Node.return_value": node_mock, + "native.machine.System.return_value": system_mock, + } + ), + ) + monkeypatch.setitem( + sys.modules, + "dragon.infrastructure.connection", + MagicMock(), + ) + monkeypatch.setitem( + sys.modules, + "dragon.infrastructure.policy", + MagicMock(**{"Policy.return_value": MagicMock()}), + ) + monkeypatch.setitem(sys.modules, "dragon.native.process", process_module_mock) + monkeypatch.setitem(sys.modules, "dragon.native.process_group", MagicMock()) + monkeypatch.setitem( + sys.modules, + "dragon.native.machine", + 
MagicMock( + **{"System.return_value": system_mock, "Node.return_value": node_mock} + ), + ) + from smartsim._core.launcher.dragon.dragonBackend import DragonBackend + + dragon_backend = DragonBackend(pid=99999) + monkeypatch.setattr(dragon_backend, "_hosts", ["node1", "node2", "node3"]) + monkeypatch.setattr( + dragon_backend, "_free_hosts", collections.deque(dragon_backend._hosts) + ) + + return dragon_backend + + +def set_mock_group_infos( + monkeypatch: pytest.MonkeyPatch, dragon_backend: "DragonBackend" +) -> t.Dict[str, "ProcessGroupInfo"]: + dragon_mock = MagicMock() + process_mock = MagicMock() + process_mock.configure_mock(**{"returncode": 0}) + dragon_mock.configure_mock(**{"native.process.Process.return_value": process_mock}) + monkeypatch.setitem(sys.modules, "dragon", dragon_mock) + from smartsim._core.launcher.dragon.dragonBackend import ProcessGroupInfo + + running_group = MagicMock(status="Running") + error_group = MagicMock(status="Error") + hosts = dragon_backend._hosts + + group_infos = { + "abc123-1": ProcessGroupInfo( + SmartSimStatus.STATUS_RUNNING, + running_group, + [123], + [], + hosts[0:1], + MagicMock(), + ), + "del999-2": ProcessGroupInfo( + SmartSimStatus.STATUS_CANCELLED, + error_group, + [124], + [-9], + hosts[1:2], + MagicMock(), + ), + "c1091vz-3": ProcessGroupInfo( + SmartSimStatus.STATUS_COMPLETED, + MagicMock(), + [125, 126], + [0], + hosts[1:3], + MagicMock(), + ), + "0ghjk1-4": ProcessGroupInfo( + SmartSimStatus.STATUS_FAILED, + error_group, + [127], + [-1], + hosts[2:3], + MagicMock(), + ), + "ljace0-5": ProcessGroupInfo( + SmartSimStatus.STATUS_NEVER_STARTED, None, [], [], [], None + ), + } + + monkeypatch.setattr(dragon_backend, "_group_infos", group_infos) + monkeypatch.setattr(dragon_backend, "_free_hosts", collections.deque(hosts[1:3])) + monkeypatch.setattr(dragon_backend, "_allocated_hosts", {hosts[0]: "abc123-1"}) + monkeypatch.setattr(dragon_backend, "_running_steps", ["abc123-1"]) + + return group_infos + + +def 
test_handshake_request(monkeypatch: pytest.MonkeyPatch) -> None: + dragon_backend = get_mock_backend(monkeypatch) + + handshake_req = DragonHandshakeRequest() + handshake_resp = dragon_backend.process_request(handshake_req) + + assert isinstance(handshake_resp, DragonHandshakeResponse) + assert handshake_resp.dragon_pid == 99999 + + +def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: + dragon_backend = get_mock_backend(monkeypatch) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + ) + + run_resp = dragon_backend.process_request(run_req) + assert isinstance(run_resp, DragonRunResponse) + + step_id = run_resp.step_id + assert dragon_backend._queued_steps[step_id] == run_req + + dragon_backend._start_steps() + + assert dragon_backend._running_steps == [step_id] + assert len(dragon_backend._queued_steps) == 0 + assert len(dragon_backend._free_hosts) == 1 + assert dragon_backend._allocated_hosts["node1"] == step_id + assert dragon_backend._allocated_hosts["node2"] == step_id + + dragon_backend._group_infos[step_id].puids = [123] + monkeypatch.setattr( + dragon_backend._group_infos[step_id].process_group, "status", "Running" + ) + + dragon_backend._update() + + assert dragon_backend._running_steps == [step_id] + assert len(dragon_backend._queued_steps) == 0 + assert len(dragon_backend._free_hosts) == 1 + assert dragon_backend._allocated_hosts["node1"] == step_id + assert dragon_backend._allocated_hosts["node2"] == step_id + + print(dragon_backend._group_infos) + + dragon_backend._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED + + dragon_backend._update() + assert not dragon_backend._running_steps + + +def test_udpate_status_request(monkeypatch: pytest.MonkeyPatch) -> None: + dragon_backend = get_mock_backend(monkeypatch) + + group_infos = set_mock_group_infos(monkeypatch, dragon_backend) + + status_update_request = 
DragonUpdateStatusRequest(step_ids=list(group_infos.keys())) + + status_update_response = dragon_backend.process_request(status_update_request) + + assert isinstance(status_update_response, DragonUpdateStatusResponse) + assert status_update_response.statuses == { + step_id: (grp_info.status, grp_info.return_codes) + for step_id, grp_info in group_infos.items() + } + + +def test_stop_request(monkeypatch: pytest.MonkeyPatch) -> None: + dragon_backend = get_mock_backend(monkeypatch) + group_infos = set_mock_group_infos(monkeypatch, dragon_backend) + + running_steps = [ + step_id + for step_id, group in group_infos.items() + if group.status == SmartSimStatus.STATUS_RUNNING + ] + + step_id_to_stop = running_steps[0] + + stop_request = DragonStopRequest(step_id=step_id_to_stop) + + stop_response = dragon_backend.process_request(stop_request) + + assert isinstance(stop_response, DragonStopResponse) + assert len(dragon_backend._stop_requests) == 1 + + dragon_backend._update() + + assert len(dragon_backend._stop_requests) == 0 + assert ( + dragon_backend._group_infos[step_id_to_stop].status + == SmartSimStatus.STATUS_CANCELLED + ) + + assert len(dragon_backend._allocated_hosts) == 0 + assert len(dragon_backend._free_hosts) == 3 + + +@pytest.mark.parametrize( + "immediate, frontend_shutdown", + [[True, True], [True, False], [False, True], [False, False]], +) +def test_shutdown_request( + monkeypatch: pytest.MonkeyPatch, immediate: bool, frontend_shutdown: bool +) -> None: + monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", "0") + dragon_backend = get_mock_backend(monkeypatch) + monkeypatch.setattr(dragon_backend, "_cooldown_period", 1) + _ = set_mock_group_infos(monkeypatch, dragon_backend) + + shutdown_req = DragonShutdownRequest( + immediate=immediate, frontend_shutdown=frontend_shutdown + ) + shutdown_resp = dragon_backend.process_request(shutdown_req) + + assert dragon_backend._shutdown_requested + assert isinstance(shutdown_resp, DragonShutdownResponse) + assert 
dragon_backend._can_shutdown == immediate + assert dragon_backend.frontend_shutdown == frontend_shutdown + + dragon_backend._update() + assert not dragon_backend.should_shutdown + time.sleep(dragon_backend._cooldown_period + 0.1) + dragon_backend._update() + + assert dragon_backend.should_shutdown == immediate + assert dragon_backend._has_cooled_down == immediate + + +@pytest.mark.parametrize("telemetry_flag", ["0", "1"]) +def test_cooldown_is_set(monkeypatch: pytest.MonkeyPatch, telemetry_flag: str) -> None: + monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", telemetry_flag) + dragon_backend = get_mock_backend(monkeypatch) + + expected_cooldown = ( + 2 * CONFIG.telemetry_frequency + 5 if int(telemetry_flag) > 0 else 5 + ) + + if telemetry_flag: + assert dragon_backend.cooldown_period == expected_cooldown + else: + assert dragon_backend.cooldown_period == expected_cooldown + + +def test_heartbeat_and_time(monkeypatch: pytest.MonkeyPatch) -> None: + dragon_backend = get_mock_backend(monkeypatch) + first_heartbeat = dragon_backend.last_heartbeat + assert dragon_backend.current_time > first_heartbeat + dragon_backend._heartbeat() + assert dragon_backend.last_heartbeat > first_heartbeat + + +@pytest.mark.parametrize("num_nodes", [1, 3, 100]) +def test_can_honor(monkeypatch: pytest.MonkeyPatch, num_nodes: int) -> None: + dragon_backend = get_mock_backend(monkeypatch) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=num_nodes, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + ) + + assert dragon_backend._can_honor(run_req)[0] == ( + num_nodes <= len(dragon_backend._hosts) + ) + + +def test_get_id(monkeypatch: pytest.MonkeyPatch) -> None: + dragon_backend = get_mock_backend(monkeypatch) + step_id = dragon_backend._get_new_id() + + assert step_id.endswith("0") + assert step_id != dragon_backend._get_new_id() From ab987500f1198e0eec9b62cbe67d760631994231 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: 
Wed, 1 May 2024 00:36:09 +0200 Subject: [PATCH 047/101] Clarify header in Group Info table --- smartsim/_core/launcher/dragon/dragonBackend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 4ea098559..fad2bd9b3 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -215,7 +215,7 @@ def _proc_group_info_table_line( @property def step_table(self) -> str: - headers = ["Step", "Status", "Hosts", "Return codes", "PUIDS"] + headers = ["Step", "Status", "Hosts", "Return codes", "Num procs"] values = [] with self._queue_lock: From 4f81c10298491607a7d0ede951ca009e443a99ca Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 1 May 2024 01:21:46 +0000 Subject: [PATCH 048/101] Fix issue on PALS with truncated job names --- smartsim/_core/launcher/pbs/pbsLauncher.py | 17 ++++++++++++++++- smartsim/_core/launcher/pbs/pbsParser.py | 20 ++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/smartsim/_core/launcher/pbs/pbsLauncher.py b/smartsim/_core/launcher/pbs/pbsLauncher.py index bb1b46d46..9951b9bbd 100644 --- a/smartsim/_core/launcher/pbs/pbsLauncher.py +++ b/smartsim/_core/launcher/pbs/pbsLauncher.py @@ -53,7 +53,11 @@ ) from ..stepInfo import PBSStepInfo, StepInfo from .pbsCommands import qdel, qstat -from .pbsParser import parse_qstat_jobid, parse_step_id_from_qstat +from .pbsParser import ( + parse_qstat_jobid, + parse_qstat_jobid_json, + parse_step_id_from_qstat, +) logger = get_logger(__name__) @@ -188,6 +192,17 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: qstat_out, _ = qstat(step_ids) stats = [parse_qstat_jobid(qstat_out, str(step_id)) for step_id in step_ids] + + # Fallback: if all jobs result as NOTFOUND, it might be an issue + # with truncated names, we resort to json format which does not truncate + # information + if 
all(stat == "NOTFOUND" for stat in stats): + qstat_out_json, _ = qstat(["-f", "-F", "json"] + step_ids) + stats = [ + parse_qstat_jobid_json(qstat_out_json, str(step_id)) + for step_id in step_ids + ] + # create PBSStepInfo objects to return for stat, _ in zip(stats, step_ids): diff --git a/smartsim/_core/launcher/pbs/pbsParser.py b/smartsim/_core/launcher/pbs/pbsParser.py index 362577595..dbbee70e9 100644 --- a/smartsim/_core/launcher/pbs/pbsParser.py +++ b/smartsim/_core/launcher/pbs/pbsParser.py @@ -83,6 +83,26 @@ def parse_qstat_jobid(output: str, job_id: str) -> str: return result +def parse_qstat_jobid_json(output: str, job_id: str) -> str: + """Parse and return output of the qstat command run with JSON options + to obtain job status. + + :param output: output of the qstat command in JSON format + :param job_id: allocation id or job step id + :return: status + """ + out_json = load_and_clean_json(output) + + if "Jobs" not in out_json: + return "NOTFOUND" + jobs: dict[str, t.Any] = out_json["Jobs"] + job: t.Optional[dict[str, t.Any]] = jobs.get(job_id, None) + if job is None: + return "NOTFOUND" + else: + return job.get("job_state", "NOTFOUND") + + def parse_qstat_nodes(output: str) -> t.List[str]: """Parse and return the qstat command run with options to obtain node list. From d03312e6628e8e7642fa71bc5909c61f7855d311 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 1 May 2024 14:49:28 +0000 Subject: [PATCH 049/101] Add docstrings --- smartsim/_core/launcher/dragon/dragonBackend.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index fad2bd9b3..f9ece4c5a 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -215,6 +215,8 @@ def _proc_group_info_table_line( @property def step_table(self) -> str: + """Table representation of all jobs which have been started on the server. 
+ """ headers = ["Step", "Status", "Hosts", "Return codes", "Num procs"] values = [] @@ -228,6 +230,8 @@ def step_table(self) -> str: @property def host_table(self) -> str: + """Table representation of current state of nodes available in the allocation. + """ headers = ["Host", "State"] values = [] @@ -253,6 +257,10 @@ def __str__(self) -> str: return self.get_status_message() def get_status_message(self) -> str: + """Message with status of available nodes and history of launched jobs. + + :returns: Status message + """ msg = ["Dragon server backend update"] msg.append(self.host_table) msg.append(self.step_table) @@ -263,6 +271,10 @@ def _heartbeat(self) -> None: @property def cooldown_period(self) -> int: + """Time (in seconds) the server will wait before shutting down + + when exit conditions are met (see ``should_shutdown()`` for further details). + """ return self._cooldown_period @property From e4cbe8ca9d08ff367368c68d8b44dfaa49687103 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 1 May 2024 20:01:08 +0200 Subject: [PATCH 050/101] Fix lint and style --- smartsim/_core/launcher/dragon/dragonBackend.py | 7 ++++--- smartsim/_core/launcher/pbs/pbsParser.py | 3 +-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index f9ece4c5a..d37fd0e90 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -215,8 +215,7 @@ def _proc_group_info_table_line( @property def step_table(self) -> str: - """Table representation of all jobs which have been started on the server. - """ + """Table representation of all jobs which have been started on the server.""" headers = ["Step", "Status", "Hosts", "Return codes", "Num procs"] values = [] @@ -230,7 +229,9 @@ def step_table(self) -> str: @property def host_table(self) -> str: - """Table representation of current state of nodes available in the allocation. 
+ """Table representation of current state of nodes available + + in the allocation. """ headers = ["Host", "State"] values = [] diff --git a/smartsim/_core/launcher/pbs/pbsParser.py b/smartsim/_core/launcher/pbs/pbsParser.py index dbbee70e9..603e825e7 100644 --- a/smartsim/_core/launcher/pbs/pbsParser.py +++ b/smartsim/_core/launcher/pbs/pbsParser.py @@ -99,8 +99,7 @@ def parse_qstat_jobid_json(output: str, job_id: str) -> str: job: t.Optional[dict[str, t.Any]] = jobs.get(job_id, None) if job is None: return "NOTFOUND" - else: - return job.get("job_state", "NOTFOUND") + return str(job.get("job_state", "NOTFOUND")) def parse_qstat_nodes(output: str) -> t.List[str]: From b846c4894e7b25a85bfa65512db9f62177a778b2 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 01:12:20 +0200 Subject: [PATCH 051/101] Update smartsim/_core/launcher/dragon/dragonBackend.py Co-authored-by: Matt Drozt --- smartsim/_core/launcher/dragon/dragonBackend.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index d37fd0e90..286e5ab5e 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -234,11 +234,11 @@ def host_table(self) -> str: in the allocation. 
""" headers = ["Host", "State"] - values = [] - with self._queue_lock: - for host in self._hosts: - values.append([host, "Free" if host in self._free_hosts else "Busy"]) + values = [ + self._proc_group_info_table_line(step, group_info) + for step, group_info in self._group_infos.items() + ] return tabulate(values, headers, disable_numparse=True, tablefmt="github") From 94e1094fda1f5ff08b6696ea75205f68a4f58ef5 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 01:12:57 +0200 Subject: [PATCH 052/101] Update smartsim/_core/launcher/dragon/dragonBackend.py Co-authored-by: Matt Drozt --- smartsim/_core/launcher/dragon/dragonBackend.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 286e5ab5e..ec0a78978 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -217,13 +217,11 @@ def _proc_group_info_table_line( def step_table(self) -> str: """Table representation of all jobs which have been started on the server.""" headers = ["Step", "Status", "Hosts", "Return codes", "Num procs"] - values = [] - with self._queue_lock: - for step, group_info in self._group_infos.items(): - values.append( - DragonBackend._proc_group_info_table_line(step, group_info) - ) + values = [ + self._proc_group_info_table_line(step, group_info) + for step, group_info in self._group_infos.items() + ] return tabulate(values, headers, disable_numparse=True, tablefmt="github") From fe507d111e15356ba764295328094789372a6e34 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 01:13:24 +0200 Subject: [PATCH 053/101] Update smartsim/_core/launcher/dragon/dragonBackend.py Co-authored-by: Matt Drozt --- smartsim/_core/launcher/dragon/dragonBackend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py 
b/smartsim/_core/launcher/dragon/dragonBackend.py index ec0a78978..1d00fb288 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -250,7 +250,7 @@ def _initialize_hosts(self) -> None: self._free_hosts: t.Deque[str] = collections.deque(self._hosts) """List of hosts on which steps can be launched""" self._allocated_hosts: t.Dict[str, str] = {} - """List of hosts on which a step is already running""" + """Mapping of hosts on which a step is already running to step ID""" def __str__(self) -> str: return self.get_status_message() From 6e63f747c5f3275b3a098bc5cf92c49c1e790efb Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 01:14:11 +0200 Subject: [PATCH 054/101] Update smartsim/_core/launcher/dragon/dragonBackend.py Co-authored-by: Matt Drozt --- smartsim/_core/launcher/dragon/dragonBackend.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 1d00fb288..badb8cd4d 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -260,10 +260,10 @@ def get_status_message(self) -> str: :returns: Status message """ - msg = ["Dragon server backend update"] - msg.append(self.host_table) - msg.append(self.step_table) - return "\n".join(msg) + return textwrap.dedent(f"""\ + Dragon server backend update + {self.host_table} + {self.step_table}""") def _heartbeat(self) -> None: self._last_beat = self.current_time From 936f8eed18377f8b4239738d5e67fb0f76133de0 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 01:16:06 +0200 Subject: [PATCH 055/101] Update smartsim/_core/launcher/dragon/dragonBackend.py Co-authored-by: Matt Drozt --- smartsim/_core/launcher/dragon/dragonBackend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py 
b/smartsim/_core/launcher/dragon/dragonBackend.py index badb8cd4d..a9f41bc97 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -150,7 +150,8 @@ def __init__(self, pid: int) -> None: """ProcessGroup execution state information""" self._queue_lock = RLock() """Lock that needs to be acquired to access internal queues""" - self._step_id: int = 0 + self._step_ids = (f"{create_short_id_str()}-{id}" + for id in itertools.count()) """Incremental ID to assign to new steps prior to execution""" self._initialize_hosts() From accc4f93b3a099f4cc7bb9cd64ca000a171057d7 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 01:17:36 +0200 Subject: [PATCH 056/101] Initial post-review changes --- smartsim/_core/entrypoints/dragon.py | 45 +++++-- smartsim/_core/entrypoints/dragon_client.py | 110 ++++++++++++++---- smartsim/_core/entrypoints/redis.py | 3 +- .../_core/launcher/dragon/dragonBackend.py | 40 ++++--- .../_core/launcher/dragon/dragonConnector.py | 55 +++++---- smartsim/_core/launcher/step/dragonStep.py | 4 +- smartsim/settings/settings.py | 2 +- tests/test_dragon_backend.py | 16 ++- 8 files changed, 191 insertions(+), 84 deletions(-) diff --git a/smartsim/_core/entrypoints/dragon.py b/smartsim/_core/entrypoints/dragon.py index 9441bb21b..92ebd735f 100644 --- a/smartsim/_core/entrypoints/dragon.py +++ b/smartsim/_core/entrypoints/dragon.py @@ -50,6 +50,10 @@ from smartsim._core.utils.network import get_best_interface_and_address from smartsim.log import ContextThread, get_logger +""" +Dragon server entrypoint script +""" + logger = get_logger("Dragon Server") # kill is not catchable @@ -72,11 +76,6 @@ def handle_signal(signo: int, _frame: t.Optional[FrameType] = None) -> None: cleanup() -""" -Dragon server entrypoint script -""" - - def get_log_path() -> str: config = get_config() return config.dragon_log_filename @@ -99,9 +98,18 @@ def print_summary(network_interface: str, ip_address: str) -> None: 
) -def restart_updater( +def start_updater( backend: DragonBackend, updater: t.Optional[ContextThread] ) -> ContextThread: + """Start the ``DragonBackend`` updater thread. + + If ``updater`` is not None, then it is first checked and if it + alive, no other thread is started. + + :param backend: The dragon backend for which the thread will be started + :param updater: An existing updater thread that might have to be replaced + :return: Running updater thread + """ # If the updater was started, check if it completed or died if updater is not None: updater.join(0.1) @@ -114,9 +122,22 @@ def restart_updater( def is_updater_healthy(backend: DragonBackend) -> bool: + """Check if the backend has been updated recently. + + The acceptable delay is defined as the server timeout plus the backend's cooldown + period. If the server timeout is set to `-1`, then the acceptable delay is set to + one minute plus the cooldown period. + + :param backend: The backend for which the updater's health is checked + :return: Whether the backend was updated recently + """ + server_timeout = get_config().dragon_server_timeout / 1000 + acceptable_delay = backend.cooldown_period + ( + 60.0 if server_timeout == -1 else server_timeout + ) heartbeat_delay = backend.current_time - backend.last_heartbeat - if heartbeat_delay > 30.0 + float(backend.cooldown_period): + if heartbeat_delay > acceptable_delay: logger.debug( f"Updater inactive for {heartbeat_delay:.2f} seconds, will request restart." ) @@ -125,9 +146,15 @@ def is_updater_healthy(backend: DragonBackend) -> bool: def updater_fallback(backend: DragonBackend, updater: ContextThread) -> ContextThread: + """Check if updater has updated the backend recently, if not, check its status + and start a new one if it is not alive. 
+ :param backend: The dragon backend for which the udpater's health must be checked + :param updater: The updater thread which has to be checked and (possibly) replaced + :return: Running updater thread + """ if is_updater_healthy(backend): return updater - return restart_updater(backend, updater) + return start_updater(backend, updater) # pylint: disable-next=too-many-statements @@ -141,7 +168,7 @@ def run( dragon_head_socket.bind(dragon_head_address) dragon_backend = DragonBackend(pid=dragon_pid) - backend_updater = restart_updater(dragon_backend, None) + backend_updater = start_updater(dragon_backend, None) server = dragonSockets.as_server(dragon_head_socket) logger.debug(f"Listening to {dragon_head_address}") diff --git a/smartsim/_core/entrypoints/dragon_client.py b/smartsim/_core/entrypoints/dragon_client.py index 4043d5308..799d16b6a 100644 --- a/smartsim/_core/entrypoints/dragon_client.py +++ b/smartsim/_core/entrypoints/dragon_client.py @@ -25,12 +25,15 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import argparse +import dataclasses import json import os import signal import sys import time import typing as t +from pathlib import Path +from types import FrameType import zmq @@ -43,34 +46,80 @@ ) from smartsim.log import get_logger -SIGNALS = [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM, signal.SIGABRT] +""" +Dragon server entrypoint script +""" logger = get_logger("Dragon Client") +SIGNALS = [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM, signal.SIGABRT] + + +@dataclasses.dataclass +class DragonClientEntrypointArgs: + submit: Path def cleanup() -> None: logger.debug("Cleaning up") -def main(args: argparse.Namespace) -> int: - +def parse_requests(request_filepath: Path) -> t.List[DragonRequest]: requests: t.List[DragonRequest] = [] - try: - with open(args.submit, "r", encoding="utf-8") as request_file: + with open(request_filepath, "r", encoding="utf-8") as request_file: req_strings = json.load(fp=request_file) - except FileNotFoundError: + except FileNotFoundError as e: logger.error( "Could not find file with run requests," - f"please check whether {args.submit} exists." + f"please check whether {request_filepath} exists." 
) - return 1 - except json.JSONDecodeError: - logger.error(f"Could not decode request file {args.submit}.") - return 1 + raise e from None + except json.JSONDecodeError as e: + logger.error(f"Could not decode request file {request_filepath}.") + raise e from None + + requests = [request_registry.from_string(req_str) for req_str in req_strings] + + return requests + + +def parse_arguments(args: t.List[str]) -> DragonClientEntrypointArgs: + parser = argparse.ArgumentParser( + prefix_chars="+", + description="SmartSim Dragon Client Process, to be used in batch scripts", + ) + parser.add_argument("+submit", type=str, help="Path to request file", required=True) + args_ = parser.parse_args(args) + + if not args_.submit: + raise ValueError("Empty request file.") + + return DragonClientEntrypointArgs(submit=Path(args_.submit)) + + +def handle_signal(signo: int, _frame: t.Optional[FrameType] = None) -> None: + if not signo: + logger.info("Received signal with no signo") + else: + logger.info(f"Received signal {signo}") + cleanup() + - for req_str in req_strings: - requests.append(request_registry.from_string(req_str)) +def register_signal_handlers() -> None: + # make sure to register the cleanup before the start + # the process so our signaller will be able to stop + # the database process. + for sig in SIGNALS: + signal.signal(sig, handle_signal) + + +def execute_entrypoint(args: DragonClientEntrypointArgs) -> int: + + try: + requests = parse_requests(args.submit) + except Exception as e: + logger.error(f"Dragon client failed to parse request file", exc_info=True) + return os.EX_OSFILE requests.append(DragonShutdownRequest(immediate=False, frontend_shutdown=True)) @@ -87,7 +136,7 @@ def main(args: argparse.Namespace) -> int: logger.error( "Could not get Dragon Server PID and will not be able to monitor it." 
) - return 1 + return os.EX_IOERR while True: try: @@ -97,20 +146,33 @@ def main(args: argparse.Namespace) -> int: logger.debug("Could not reach server, assuming backend has shut down") break - logger.info("Server has finished.") + logger.info("Client has finished.") - return 0 + return os.EX_OK -if __name__ == "__main__": +def main(args_: t.List[str]) -> int: + """Execute the dragon client entrypoint as a module""" + os.environ["PYTHONUNBUFFERED"] = "1" logger.info("Dragon client started") - parser = argparse.ArgumentParser( - prefix_chars="+", - description="SmartSim Dragon Client Process, to be used in batch scripts", - ) - parser.add_argument("+submit", type=str, help="Path to request file", required=True) - args_ = parser.parse_args() + args = parse_arguments(args_) + register_signal_handlers() + + try: + return execute_entrypoint(args) + except Exception: + logger.error( + "An unexpected error occurred in the Dragon client entrypoint", + exc_info=True, + ) + finally: + cleanup() + + return os.EX_SOFTWARE + + +if __name__ == "__main__": - sys.exit(main(args_)) + sys.exit(main(sys.argv[1:])) diff --git a/smartsim/_core/entrypoints/redis.py b/smartsim/_core/entrypoints/redis.py index 6904d434a..f5a70b025 100644 --- a/smartsim/_core/entrypoints/redis.py +++ b/smartsim/_core/entrypoints/redis.py @@ -122,7 +122,8 @@ def main(args: argparse.Namespace) -> int: print(line.decode("utf-8").rstrip(), flush=True) except Exception as e: cleanup() - raise SSInternalError("Database process starter raised an exception") from e + logger.error(f"Database process starter raised an exception", exc_info=True) + return 1 return 0 diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index d37fd0e90..ec81d681a 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -28,6 +28,7 @@ import time import typing as t from dataclasses import dataclass, field +from enum 
import Enum from threading import RLock from tabulate import tabulate @@ -36,19 +37,11 @@ # isort: off import dragon.infrastructure.connection as dragon_connection import dragon.infrastructure.policy as dragon_policy +import dragon.native.group_state as dragon_group_state import dragon.native.process as dragon_process import dragon.native.process_group as dragon_process_group import dragon.native.machine as dragon_machine -# from dragon.infrastructure.connection import Connection -# from dragon.infrastructure.policy import Policy -# from dragon.native.process import Process, ProcessTemplate, Popen -# from dragon.native.process_group import ( -# ProcessGroup, -# DragonProcessGroupError, -# ) -# from dragon.native.machine import System, Node - # pylint: enable=import-error # isort: on from ...._core.config import get_config @@ -70,12 +63,15 @@ from ....log import get_logger from ....status import TERMINAL_STATUSES, SmartSimStatus -DRG_ERROR_STATUS = "Error" -DRG_RUNNING_STATUS = "Running" - logger = get_logger(__name__) +class DragonStatus(str, Enum): + ERROR = str(dragon_group_state.Error()) + RUNNING = str(dragon_group_state.Running()) + def __str__(self) -> str: + return self.value + @dataclass class ProcessGroupInfo: status: SmartSimStatus @@ -148,6 +144,8 @@ def __init__(self, pid: int) -> None: """PID of dragon executable which launched this server""" self._group_infos: t.Dict[str, ProcessGroupInfo] = {} """ProcessGroup execution state information""" + self._step_id_lock = RLock() + """Lock used to atomically create new step ids""" self._queue_lock = RLock() """Lock that needs to be acquired to access internal queues""" self._step_id: int = 0 @@ -347,9 +345,10 @@ def _allocate_step( return to_allocate def _get_new_id(self) -> str: - step_id = create_short_id_str() + "-" + str(self._step_id) - self._step_id += 1 - return step_id + with self._step_id_lock: + step_id = create_short_id_str() + "-" + str(self._step_id) + self._step_id += 1 + return step_id 
@staticmethod def _create_redirect_workers( @@ -406,7 +405,7 @@ def _stop_steps(self) -> None: proc_group = self._group_infos[step_id].process_group if ( proc_group is not None - and proc_group.status == DRG_RUNNING_STATUS + and proc_group.status == DragonStatus.RUNNING ): try: proc_group.kill() @@ -508,6 +507,10 @@ def _start_steps(self) -> None: f"Could not redirect stdout and stderr for PUIDS {puids}" ) from e self._group_infos[step_id].redir_workers = redir_grp + elif puids is not None and grp_status == SmartSimStatus.STATUS_RUNNING: + logger.error("Cannot redirect workers: some PUIDS are missing") + + if started: logger.debug(f"{started=}") @@ -533,7 +536,8 @@ def _refresh_statuses(self) -> None: group_info.status = SmartSimStatus.STATUS_FAILED group_info.return_codes = [-1] elif group_info.status not in TERMINAL_STATUSES: - if grp.status == DRG_RUNNING_STATUS: + print(grp.status, str(DragonStatus.RUNNING), grp.status==str(DragonStatus.RUNNING)) + if grp.status == str(DragonStatus.RUNNING): group_info.status = SmartSimStatus.STATUS_RUNNING else: puids = group_info.puids @@ -554,7 +558,7 @@ def _refresh_statuses(self) -> None: group_info.status = ( SmartSimStatus.STATUS_FAILED if any(group_info.return_codes) - or grp.status == DRG_ERROR_STATUS + or grp.status == DragonStatus.ERROR else SmartSimStatus.STATUS_COMPLETED ) diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index b1fa0d469..02eb5b3a3 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -27,6 +27,7 @@ from __future__ import annotations import atexit +from collections import defaultdict import fileinput import itertools import json @@ -167,42 +168,38 @@ def _get_new_authenticator( @staticmethod def _get_dragon_log_level() -> str: - smartsim_to_dragon = { - "developer": "INFO", - "debug": "NONE", - "info": "NONE", - "quiet": "NONE", - } + smartsim_to_dragon = 
defaultdict(lambda: "NONE") + smartsim_to_dragon["developer"] = "INFO" return smartsim_to_dragon.get(get_config().log_level, "NONE") def _connect_to_existing_server(self, path: Path) -> None: config = get_config() dragon_config_log = path / config.dragon_log_filename - if dragon_config_log.is_file(): + if not dragon_config_log.is_file(): + return + + dragon_confs = self._parse_launched_dragon_server_info_from_files( + [dragon_config_log] + ) + logger.debug(dragon_confs) - dragon_confs = self._parse_launched_dragon_server_info_from_files( - [dragon_config_log] + for dragon_conf in dragon_confs: + logger.debug( + "Found dragon server config file. Checking if the server" + f" is still up at address {dragon_conf['address']}." ) - logger.debug(dragon_confs) - for dragon_conf in dragon_confs: - if not "address" in dragon_conf: - continue - logger.debug( - "Found dragon server config file. Checking if the server" - f" is still up at address {dragon_conf['address']}." - ) - try: - self._reset_timeout() - self._get_new_authenticator(-1) - self._handshake(dragon_conf["address"]) - except SmartSimError as e: - logger.error(e) - finally: - self._reset_timeout(config.dragon_server_timeout) - if self.is_connected: - logger.debug("Connected to existing Dragon server") - return + try: + self._reset_timeout() + self._get_new_authenticator(-1) + self._handshake(dragon_conf["address"]) + except SmartSimError as e: + logger.error(e) + finally: + self._reset_timeout(config.dragon_server_timeout) + if self.is_connected: + logger.debug("Connected to existing Dragon server") + return def _start_connector_socket(self, socket_addr: str) -> zmq.Socket[t.Any]: config = get_config() @@ -351,6 +348,8 @@ def _parse_launched_dragon_server_info_from_iterable( ) dragon_envs = [json.loads(config_dict) for config_dict in dragon_env_jsons] + dragon_envs = [dragon_env for dragon_env in dragon_envs if "address" in dragon_env] + if num_dragon_envs: sliced_dragon_envs = itertools.islice(dragon_envs, 
num_dragon_envs) return list(sliced_dragon_envs) diff --git a/smartsim/_core/launcher/step/dragonStep.py b/smartsim/_core/launcher/step/dragonStep.py index 1ec9989f5..8f95b9839 100644 --- a/smartsim/_core/launcher/step/dragonStep.py +++ b/smartsim/_core/launcher/step/dragonStep.py @@ -154,11 +154,11 @@ def add_to_batch(self, step: Step) -> None: def _dragon_entrypoint_cmd(request_file: str) -> str: """Return command needed to run the Dragon entrypoint""" cmd = [ - f"{sys.executable}", + sys.executable, "-m", "smartsim._core.entrypoints.dragon_client", "+submit", - f"{request_file}", + request_file, ] return " ".join(cmd) diff --git a/smartsim/settings/settings.py b/smartsim/settings/settings.py index 1e0b475f6..b7982a2dc 100644 --- a/smartsim/settings/settings.py +++ b/smartsim/settings/settings.py @@ -89,7 +89,7 @@ def create_batch_settings( if launcher in ["auto", "dragon"]: launcher = detect_launcher() if launcher == "dragon": - by_launcher["dragon"] = by_launcher["launcher"] + by_launcher["dragon"] = by_launcher[launcher] if launcher == "local": raise SmartSimError("Local launcher does not support batch workloads") diff --git a/tests/test_dragon_backend.py b/tests/test_dragon_backend.py index cd6e85925..87fcd04c0 100644 --- a/tests/test_dragon_backend.py +++ b/tests/test_dragon_backend.py @@ -48,6 +48,15 @@ class NodeMock(MagicMock): def hostname(self) -> str: return create_short_id_str() +class GroupStateMock(MagicMock): + def Running(self) -> MagicMock: + running = MagicMock(**{"__str__.return_value": "Running"}) + return running + + def Error(self) -> MagicMock: + error = MagicMock(**{"__str__.return_value": "Error"}) + return error + def get_mock_backend(monkeypatch: pytest.MonkeyPatch) -> "DragonBackend": @@ -63,6 +72,7 @@ def get_mock_backend(monkeypatch: pytest.MonkeyPatch) -> "DragonBackend": **{ "native.machine.Node.return_value": node_mock, "native.machine.System.return_value": system_mock, + "native.group_state": GroupStateMock(), } ), ) @@ -78,6 
+88,8 @@ def get_mock_backend(monkeypatch: pytest.MonkeyPatch) -> "DragonBackend": ) monkeypatch.setitem(sys.modules, "dragon.native.process", process_module_mock) monkeypatch.setitem(sys.modules, "dragon.native.process_group", MagicMock()) + + monkeypatch.setitem(sys.modules, "dragon.native.group_state", GroupStateMock()) monkeypatch.setitem( sys.modules, "dragon.native.machine", @@ -186,6 +198,7 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: step_id = run_resp.step_id assert dragon_backend._queued_steps[step_id] == run_req + dragon_backend._group_infos[step_id].puids = [123,124] dragon_backend._start_steps() assert dragon_backend._running_steps == [step_id] @@ -194,13 +207,14 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: assert dragon_backend._allocated_hosts["node1"] == step_id assert dragon_backend._allocated_hosts["node2"] == step_id - dragon_backend._group_infos[step_id].puids = [123] monkeypatch.setattr( dragon_backend._group_infos[step_id].process_group, "status", "Running" ) dragon_backend._update() + print("-----____--_-_-_----__\n--__--___--_--__-------") + assert dragon_backend._running_steps == [step_id] assert len(dragon_backend._queued_steps) == 0 assert len(dragon_backend._free_hosts) == 1 From 29fa5749cf8b635ca919dec305a21376f5657970 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 01:33:12 +0200 Subject: [PATCH 057/101] Update smartsim/_core/launcher/dragon/dragonConnector.py Co-authored-by: Matt Drozt --- smartsim/_core/launcher/dragon/dragonConnector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index b1fa0d469..65ea700fe 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -349,12 +349,12 @@ def _parse_launched_dragon_server_info_from_iterable( for first, config_dict in tokenized if 
"DRAGON_SERVER_CONFIG" in first ) - dragon_envs = [json.loads(config_dict) for config_dict in dragon_env_jsons] + dragon_envs = (json.loads(config_dict) for config_dict in dragon_env_jsons) if num_dragon_envs: sliced_dragon_envs = itertools.islice(dragon_envs, num_dragon_envs) return list(sliced_dragon_envs) - return dragon_envs + return list(dragon_envs) @classmethod def _parse_launched_dragon_server_info_from_files( From 5a59cc87407acfc53e07e923b3621f15d89b8b0f Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 01:38:30 +0200 Subject: [PATCH 058/101] Update smartsim/_core/launcher/dragon/dragonLauncher.py Co-authored-by: Matt Drozt --- smartsim/_core/launcher/dragon/dragonLauncher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py index 46405348c..4c0e2b45f 100644 --- a/smartsim/_core/launcher/dragon/dragonLauncher.py +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -287,7 +287,7 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: logger.error(msg) info = StepInfo( SmartSimStatus.STATUS_FAILED, - str(SmartSimStatus.STATUS_FAILED), + SmartSimStatus.STATUS_FAILED.value, -1, ) else: From 3e2cc16b5ca2c18051c8e0edb9398cee182f1b90 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 01:40:43 +0200 Subject: [PATCH 059/101] Update smartsim/_core/launcher/dragon/dragonLauncher.py Co-authored-by: Matt Drozt --- smartsim/_core/launcher/dragon/dragonLauncher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py index 4c0e2b45f..1c3f57111 100644 --- a/smartsim/_core/launcher/dragon/dragonLauncher.py +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -302,7 +302,7 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: logger.error(_err_msg) else: 
grp_ret_code = None - info = StepInfo(status, str(status), grp_ret_code) + info = StepInfo(status, status.value, grp_ret_code) step_id_updates[step_id] = info From facf2380b825f890638bd0b9edecaffb839abf4e Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 01:47:31 +0200 Subject: [PATCH 060/101] Update smartsim/_core/entrypoints/dragon_client.py Co-authored-by: Matt Drozt --- smartsim/_core/entrypoints/dragon_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/entrypoints/dragon_client.py b/smartsim/_core/entrypoints/dragon_client.py index 4043d5308..341575244 100644 --- a/smartsim/_core/entrypoints/dragon_client.py +++ b/smartsim/_core/entrypoints/dragon_client.py @@ -97,7 +97,7 @@ def main(args: argparse.Namespace) -> int: logger.debug("Could not reach server, assuming backend has shut down") break - logger.info("Server has finished.") + logger.info("Client has finished.") return 0 From 048a5a63c781132ccfbd2f8ac2a62fc46cf11a7b Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 01:48:29 +0200 Subject: [PATCH 061/101] Remove debug logs --- smartsim/_core/launcher/dragon/dragonBackend.py | 14 ++++---------- smartsim/_core/launcher/dragon/dragonConnector.py | 2 +- tests/test_dragon_backend.py | 8 ++------ 3 files changed, 7 insertions(+), 17 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 709c82490..3c1bf7a2f 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import collections import functools +import itertools import time import typing as t from dataclasses import dataclass, field @@ -32,6 +33,7 @@ from threading import RLock from tabulate import tabulate +import textwrap # pylint: disable=import-error # isort: off @@ -144,8 +146,6 @@ def __init__(self, pid: int) -> None: """PID of dragon executable which launched this server""" self._group_infos: t.Dict[str, ProcessGroupInfo] = {} """ProcessGroup execution state information""" - self._step_id_lock = RLock() - """Lock used to atomically create new step ids""" self._queue_lock = RLock() """Lock that needs to be acquired to access internal queues""" self._step_ids = (f"{create_short_id_str()}-{id}" @@ -313,7 +313,7 @@ def should_shutdown(self) -> bool: @property def current_time(self) -> float: """Current time for DragonBackend object, in seconds since the Epoch""" - return time.time_ns() / 1e9 + return time.time() def _can_honor(self, request: DragonRunRequest) -> t.Tuple[bool, t.Optional[str]]: """Check if request can be honored with resources available in the allocation. 
@@ -343,11 +343,6 @@ def _allocate_step( to_allocate.append(host) return to_allocate - def _get_new_id(self) -> str: - with self._step_id_lock: - step_id = create_short_id_str() + "-" + str(self._step_id) - self._step_id += 1 - return step_id @staticmethod def _create_redirect_workers( @@ -535,7 +530,6 @@ def _refresh_statuses(self) -> None: group_info.status = SmartSimStatus.STATUS_FAILED group_info.return_codes = [-1] elif group_info.status not in TERMINAL_STATUSES: - print(grp.status, str(DragonStatus.RUNNING), grp.status==str(DragonStatus.RUNNING)) if grp.status == str(DragonStatus.RUNNING): group_info.status = SmartSimStatus.STATUS_RUNNING else: @@ -629,7 +623,7 @@ def process_request(self, request: DragonRequest) -> DragonResponse: @process_request.register def _(self, request: DragonRunRequest) -> DragonRunResponse: - step_id = self._get_new_id() + step_id = next(self._step_ids) with self._queue_lock: honorable, err = self._can_honor(request) if not honorable: diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 02eb5b3a3..dd4f3019d 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -324,7 +324,7 @@ def cleanup(self) -> None: server_authenticator=self._authenticator, ) self._dragon_head_socket = None - self._dragon_head_pid = 0 + self._dragon_head_pid = None self._authenticator = None def send_request(self, request: DragonRequest, flags: int = 0) -> DragonResponse: diff --git a/tests/test_dragon_backend.py b/tests/test_dragon_backend.py index 87fcd04c0..224c5356c 100644 --- a/tests/test_dragon_backend.py +++ b/tests/test_dragon_backend.py @@ -213,16 +213,12 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend._update() - print("-----____--_-_-_----__\n--__--___--_--__-------") - assert dragon_backend._running_steps == [step_id] assert len(dragon_backend._queued_steps) == 0 assert 
len(dragon_backend._free_hosts) == 1 assert dragon_backend._allocated_hosts["node1"] == step_id assert dragon_backend._allocated_hosts["node2"] == step_id - print(dragon_backend._group_infos) - dragon_backend._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED dragon_backend._update() @@ -352,7 +348,7 @@ def test_can_honor(monkeypatch: pytest.MonkeyPatch, num_nodes: int) -> None: def test_get_id(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) - step_id = dragon_backend._get_new_id() + step_id = next(dragon_backend._step_ids) assert step_id.endswith("0") - assert step_id != dragon_backend._get_new_id() + assert step_id != next(dragon_backend._step_ids) From 348a01888364573a768e4c90bc276a4887d88f5e Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 01:49:17 +0200 Subject: [PATCH 062/101] Fix post-merge --- smartsim/_core/launcher/dragon/dragonConnector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index e4e39a653..9b2e008e1 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -348,7 +348,7 @@ def _parse_launched_dragon_server_info_from_iterable( ) dragon_envs = (json.loads(config_dict) for config_dict in dragon_env_jsons) - dragon_envs = [dragon_env for dragon_env in dragon_envs if "address" in dragon_env] + dragon_envs = (dragon_env for dragon_env in dragon_envs if "address" in dragon_env) if num_dragon_envs: sliced_dragon_envs = itertools.islice(dragon_envs, num_dragon_envs) From 9407c586ee61af82b25eb43ecedf12a6f47ad469 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 01:54:31 +0200 Subject: [PATCH 063/101] Style and lint --- smartsim/_core/entrypoints/dragon_client.py | 5 +++-- smartsim/_core/entrypoints/redis.py | 5 ++--- smartsim/_core/launcher/dragon/dragonBackend.py | 10 ++++------ 
smartsim/_core/launcher/dragon/dragonConnector.py | 6 ++++-- tests/test_dragon_backend.py | 3 ++- 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/smartsim/_core/entrypoints/dragon_client.py b/smartsim/_core/entrypoints/dragon_client.py index 799d16b6a..2e51ba603 100644 --- a/smartsim/_core/entrypoints/dragon_client.py +++ b/smartsim/_core/entrypoints/dragon_client.py @@ -59,6 +59,7 @@ class DragonClientEntrypointArgs: submit: Path + def cleanup() -> None: logger.debug("Cleaning up") @@ -117,8 +118,8 @@ def execute_entrypoint(args: DragonClientEntrypointArgs) -> int: try: requests = parse_requests(args.submit) - except Exception as e: - logger.error(f"Dragon client failed to parse request file", exc_info=True) + except Exception: + logger.error("Dragon client failed to parse request file", exc_info=True) return os.EX_OSFILE requests.append(DragonShutdownRequest(immediate=False, frontend_shutdown=True)) diff --git a/smartsim/_core/entrypoints/redis.py b/smartsim/_core/entrypoints/redis.py index f5a70b025..c4d8cbbd6 100644 --- a/smartsim/_core/entrypoints/redis.py +++ b/smartsim/_core/entrypoints/redis.py @@ -37,7 +37,6 @@ from smartsim._core.utils.network import current_ip from smartsim.entity.dbnode import LaunchedShardData -from smartsim.error import SSInternalError from smartsim.log import get_logger logger = get_logger(__name__) @@ -120,9 +119,9 @@ def main(args: argparse.Namespace) -> int: for line in iter(process.stdout.readline, b""): print(line.decode("utf-8").rstrip(), flush=True) - except Exception as e: + except Exception: cleanup() - logger.error(f"Database process starter raised an exception", exc_info=True) + logger.error("Database process starter raised an exception", exc_info=True) return 1 return 0 diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 3c1bf7a2f..fa47dd310 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ 
b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -26,6 +26,7 @@ import collections import functools import itertools +import textwrap import time import typing as t from dataclasses import dataclass, field @@ -33,7 +34,6 @@ from threading import RLock from tabulate import tabulate -import textwrap # pylint: disable=import-error # isort: off @@ -71,9 +71,11 @@ class DragonStatus(str, Enum): ERROR = str(dragon_group_state.Error()) RUNNING = str(dragon_group_state.Running()) + def __str__(self) -> str: return self.value + @dataclass class ProcessGroupInfo: status: SmartSimStatus @@ -148,8 +150,7 @@ def __init__(self, pid: int) -> None: """ProcessGroup execution state information""" self._queue_lock = RLock() """Lock that needs to be acquired to access internal queues""" - self._step_ids = (f"{create_short_id_str()}-{id}" - for id in itertools.count()) + self._step_ids = (f"{create_short_id_str()}-{id}" for id in itertools.count()) """Incremental ID to assign to new steps prior to execution""" self._initialize_hosts() @@ -343,7 +344,6 @@ def _allocate_step( to_allocate.append(host) return to_allocate - @staticmethod def _create_redirect_workers( global_policy: dragon_policy.Policy, @@ -504,8 +504,6 @@ def _start_steps(self) -> None: elif puids is not None and grp_status == SmartSimStatus.STATUS_RUNNING: logger.error("Cannot redirect workers: some PUIDS are missing") - - if started: logger.debug(f"{started=}") diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 9b2e008e1..fde07a858 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -27,7 +27,6 @@ from __future__ import annotations import atexit -from collections import defaultdict import fileinput import itertools import json @@ -36,6 +35,7 @@ import sys import time import typing as t +from collections import defaultdict from pathlib import Path from threading import RLock @@ 
-348,7 +348,9 @@ def _parse_launched_dragon_server_info_from_iterable( ) dragon_envs = (json.loads(config_dict) for config_dict in dragon_env_jsons) - dragon_envs = (dragon_env for dragon_env in dragon_envs if "address" in dragon_env) + dragon_envs = ( + dragon_env for dragon_env in dragon_envs if "address" in dragon_env + ) if num_dragon_envs: sliced_dragon_envs = itertools.islice(dragon_envs, num_dragon_envs) diff --git a/tests/test_dragon_backend.py b/tests/test_dragon_backend.py index 224c5356c..1419d47d5 100644 --- a/tests/test_dragon_backend.py +++ b/tests/test_dragon_backend.py @@ -48,6 +48,7 @@ class NodeMock(MagicMock): def hostname(self) -> str: return create_short_id_str() + class GroupStateMock(MagicMock): def Running(self) -> MagicMock: running = MagicMock(**{"__str__.return_value": "Running"}) @@ -198,7 +199,7 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: step_id = run_resp.step_id assert dragon_backend._queued_steps[step_id] == run_req - dragon_backend._group_infos[step_id].puids = [123,124] + dragon_backend._group_infos[step_id].puids = [123, 124] dragon_backend._start_steps() assert dragon_backend._running_steps == [step_id] From 2471ce05b99b570c6e7ebf6ecb65146bf6eb3c3a Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 01:55:43 +0200 Subject: [PATCH 064/101] Update tests/on_wlm/test_dragon.py Co-authored-by: Matt Drozt --- tests/on_wlm/test_dragon.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/on_wlm/test_dragon.py b/tests/on_wlm/test_dragon.py index 6f6f3e574..e6fd18397 100644 --- a/tests/on_wlm/test_dragon.py +++ b/tests/on_wlm/test_dragon.py @@ -93,8 +93,6 @@ def test_dragon_cannot_honor(wlmutils, test_dir): try: assert exp.get_status(model)[0] == SmartSimStatus.STATUS_FAILED - except Exception as e: - raise e from None finally: launcher: DragonLauncher = exp._control._launcher launcher.cleanup() From 0b09c7b65d0a366e60c368b7cf17907b83107d4b Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 
2024 01:56:12 +0200 Subject: [PATCH 065/101] Update smartsim/_core/launcher/dragon/dragonConnector.py Co-authored-by: Matt Drozt --- smartsim/_core/launcher/dragon/dragonConnector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index fde07a858..d63bc5b70 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -438,7 +438,7 @@ def _dragon_cleanup( server_authenticator.stop() except Exception: print("Authenticator shutdown error") - finally: + else: print("Authenticator shutdown is complete") From 923b30888aaa710700221e8ddead298e4d5d5b53 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 01:57:05 +0200 Subject: [PATCH 066/101] Update tests/on_wlm/test_dragon.py Co-authored-by: Matt Drozt --- tests/on_wlm/test_dragon.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/on_wlm/test_dragon.py b/tests/on_wlm/test_dragon.py index e6fd18397..612ff355c 100644 --- a/tests/on_wlm/test_dragon.py +++ b/tests/on_wlm/test_dragon.py @@ -71,8 +71,6 @@ def test_dragon_exp_path(global_dragon_teardown, wlmutils, test_dir, monkeypatch exp.start(model, block=True) try: assert exp.get_status(model)[0] == SmartSimStatus.STATUS_COMPLETED - except Exception as e: - raise e from None finally: launcher: DragonLauncher = exp._control._launcher launcher.cleanup() From 638c1469fcbcf7c26c21a6b688857dc0fe463e59 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 01:57:29 +0200 Subject: [PATCH 067/101] Update tests/on_wlm/test_dragon.py Co-authored-by: Matt Drozt --- tests/on_wlm/test_dragon.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/on_wlm/test_dragon.py b/tests/on_wlm/test_dragon.py index 612ff355c..a05d38141 100644 --- a/tests/on_wlm/test_dragon.py +++ b/tests/on_wlm/test_dragon.py @@ -49,8 +49,6 @@ def test_dragon_global_path(global_dragon_teardown, wlmutils, 
test_dir, monkeypa try: assert exp.get_status(model)[0] == SmartSimStatus.STATUS_COMPLETED - except Exception as e: - raise e from None finally: launcher: DragonLauncher = exp._control._launcher launcher.cleanup() From f8140774b6dac1bcc3bd46d38619657893cb42f7 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 09:31:25 -0500 Subject: [PATCH 068/101] Fix host table --- .../_core/launcher/dragon/dragonBackend.py | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index fa47dd310..9077654be 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -223,7 +223,13 @@ def step_table(self) -> str: for step, group_info in self._group_infos.items() ] - return tabulate(values, headers, disable_numparse=True, tablefmt="github") + return tabulate( + values, + headers, + disable_numparse=True, + tablefmt="github", + colalign=["left", "left", "left", "center", "center"], + ) @property def host_table(self) -> str: @@ -231,14 +237,15 @@ def host_table(self) -> str: in the allocation. 
""" - headers = ["Host", "State"] + headers = ["Host", "Status"] + + def _host_table_line(host): + return [host, "Free" if host in self._free_hosts else "Busy"] + with self._queue_lock: - values = [ - self._proc_group_info_table_line(step, group_info) - for step, group_info in self._group_infos.items() - ] + values = [_host_table_line(host) for host in self._hosts] - return tabulate(values, headers, disable_numparse=True, tablefmt="github") + return tabulate(values, headers, disable_numparse=True, tablefmt="github", colalign=["left", "center"]) def _initialize_hosts(self) -> None: with self._queue_lock: @@ -260,10 +267,7 @@ def get_status_message(self) -> str: :returns: Status message """ - return textwrap.dedent(f"""\ - Dragon server backend update - {self.host_table} - {self.step_table}""") + return f"Dragon server backend update\n{self.host_table}\n{self.step_table}" def _heartbeat(self) -> None: self._last_beat = self.current_time From 000cb0f39d8fa9afb071d4aa74039ecba0660381 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 12:09:54 -0500 Subject: [PATCH 069/101] It's always either lint or mypy --- smartsim/_core/launcher/dragon/dragonBackend.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 9077654be..b3d4cdac0 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -26,7 +26,6 @@ import collections import functools import itertools -import textwrap import time import typing as t from dataclasses import dataclass, field @@ -192,7 +191,7 @@ def __init__(self, pid: int) -> None: def _proc_group_info_table_line( step_id: str, proc_group_info: ProcessGroupInfo ) -> t.List[str]: - table_line = [step_id, f"{str(proc_group_info.status)}"] + table_line = [step_id, f"{proc_group_info.status.value}"] if proc_group_info.hosts is not None: 
table_line.append(f"{','.join(proc_group_info.hosts)}") @@ -218,6 +217,11 @@ def step_table(self) -> str: """Table representation of all jobs which have been started on the server.""" headers = ["Step", "Status", "Hosts", "Return codes", "Num procs"] with self._queue_lock: + colalign = ( + ["left", "left", "left", "center", "center"] + if len(self._group_infos) > 0 + else None + ) values = [ self._proc_group_info_table_line(step, group_info) for step, group_info in self._group_infos.items() @@ -228,7 +232,7 @@ def step_table(self) -> str: headers, disable_numparse=True, tablefmt="github", - colalign=["left", "left", "left", "center", "center"], + colalign=colalign, ) @property @@ -243,9 +247,12 @@ def _host_table_line(host): return [host, "Free" if host in self._free_hosts else "Busy"] with self._queue_lock: + colalign = ["left", "center"] if len(self._hosts) > 0 else None values = [_host_table_line(host) for host in self._hosts] - return tabulate(values, headers, disable_numparse=True, tablefmt="github", colalign=["left", "center"]) + return tabulate( + values, headers, disable_numparse=True, tablefmt="github", colalign=colalign + ) def _initialize_hosts(self) -> None: with self._queue_lock: From e84de08fef46996e8bf7bfe0d6d6b41d9a4b5453 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 12:28:40 -0500 Subject: [PATCH 070/101] Mypy --- smartsim/_core/launcher/dragon/dragonBackend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index b3d4cdac0..f25389d25 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -243,7 +243,7 @@ def host_table(self) -> str: """ headers = ["Host", "Status"] - def _host_table_line(host): + def _host_table_line(host: str) -> list[str]: return [host, "Free" if host in self._free_hosts else "Busy"] with self._queue_lock: From 
f7ba611658a67a13bb7ddc51f15a9481384ccbae Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 13:36:40 -0500 Subject: [PATCH 071/101] Update teardown in DragonConnector --- smartsim/_core/launcher/dragon/dragonConnector.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index d63bc5b70..16c9b1bd5 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -413,13 +413,12 @@ def _dragon_cleanup( DragonConnector._send_req_with_socket( server_socket, DragonShutdownRequest(), recv_flags=zmq.NOBLOCK ) - except (zmq.error.ZMQError, zmq.Again) as e: + except zmq.error.ZMQError as e: # Can't use the logger as I/O file may be closed - print("Could not send shutdown request to dragon server") - print(f"ZMQ error: {e}", flush=True) - + if not isinstance(e, zmq.Again): + print("Could not send shutdown request to dragon server") + print(f"ZMQ error: {e}", flush=True) finally: - time.sleep(5) print("Sending shutdown request is complete") if server_process_pid and psutil.pid_exists(server_process_pid): From aca088eedd52daed2492df787414cb6a371916fc Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 23:47:39 +0200 Subject: [PATCH 072/101] Add Backend View class --- .../_core/launcher/dragon/dragonBackend.py | 162 +++++++++++------- tests/test_dragon_backend.py | 40 ++++- 2 files changed, 133 insertions(+), 69 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index f25389d25..9ff084ea8 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -168,8 +168,6 @@ def __init__(self, pid: int) -> None: self._heartbeat() self._last_update_time = self._last_beat """Time at which the status update was printed the last time""" - num_hosts = 
len(self._hosts) - host_string = str(num_hosts) + (" hosts" if num_hosts != 1 else " host") self._shutdown_requested = False """Whether the shutdown was requested to this server""" self._can_shutdown = False @@ -185,74 +183,31 @@ def __init__(self, pid: int) -> None: else 5 ) """Time in seconds needed to server to complete shutdown""" - logger.debug(f"{host_string} available for execution: {self._hosts}") - - @staticmethod - def _proc_group_info_table_line( - step_id: str, proc_group_info: ProcessGroupInfo - ) -> t.List[str]: - table_line = [step_id, f"{proc_group_info.status.value}"] - if proc_group_info.hosts is not None: - table_line.append(f"{','.join(proc_group_info.hosts)}") - else: - table_line.append("") - - if proc_group_info.return_codes is not None: - table_line.append( - f"{','.join(str(ret) for ret in proc_group_info.return_codes)}" - ) - else: - table_line.append("") + self._view = DragonBackendView(self) + logger.debug(self._view.host_desc) - if proc_group_info.puids is not None: - table_line.append(f"{len(proc_group_info.puids)}") - else: - table_line.append("") - return table_line @property - def step_table(self) -> str: - """Table representation of all jobs which have been started on the server.""" - headers = ["Step", "Status", "Hosts", "Return codes", "Num procs"] + def hosts(self) -> list[str]: with self._queue_lock: - colalign = ( - ["left", "left", "left", "center", "center"] - if len(self._group_infos) > 0 - else None - ) - values = [ - self._proc_group_info_table_line(step, group_info) - for step, group_info in self._group_infos.items() - ] - - return tabulate( - values, - headers, - disable_numparse=True, - tablefmt="github", - colalign=colalign, - ) + return self._hosts @property - def host_table(self) -> str: - """Table representation of current state of nodes available - - in the allocation. 
- """ - headers = ["Host", "Status"] - - def _host_table_line(host: str) -> list[str]: - return [host, "Free" if host in self._free_hosts else "Busy"] + def allocated_hosts(self) -> dict[str, str]: + with self._queue_lock: + return self._allocated_hosts + @property + def free_hosts(self) -> t.Deque[str]: with self._queue_lock: - colalign = ["left", "center"] if len(self._hosts) > 0 else None - values = [_host_table_line(host) for host in self._hosts] + return self._free_hosts - return tabulate( - values, headers, disable_numparse=True, tablefmt="github", colalign=colalign - ) + @property + def group_infos(self) -> dict[str, ProcessGroupInfo]: + with self._queue_lock: + return self._group_infos def _initialize_hosts(self) -> None: with self._queue_lock: @@ -267,14 +222,15 @@ def _initialize_hosts(self) -> None: """Mapping of hosts on which a step is already running to step ID""" def __str__(self) -> str: - return self.get_status_message() + return self.status_message - def get_status_message(self) -> str: + @property + def status_message(self) -> str: """Message with status of available nodes and history of launched jobs. 
:returns: Status message """ - return f"Dragon server backend update\n{self.host_table}\n{self.step_table}" + return f"Dragon server backend update\n{self._view.host_table}\n{self._view.step_table}" def _heartbeat(self) -> None: self._last_beat = self.current_time @@ -678,3 +634,85 @@ def _(self, request: DragonShutdownRequest) -> DragonShutdownResponse: self._can_shutdown |= request.immediate self._frontend_shutdown = request.frontend_shutdown return DragonShutdownResponse() + + +class DragonBackendView: + def __init__(self, backend: DragonBackend): + self._backend = backend + + @property + def host_desc(self) -> str: + hosts = self._backend.hosts + num_hosts = len(hosts) + host_string = str(num_hosts) + (" hosts" if num_hosts != 1 else " host") + return f"{host_string} available for execution: {hosts}" + + @staticmethod + def _proc_group_info_table_line( + step_id: str, proc_group_info: ProcessGroupInfo + ) -> t.List[str]: + table_line = [step_id, f"{proc_group_info.status.value}"] + + if proc_group_info.hosts is not None: + table_line.append(f"{','.join(proc_group_info.hosts)}") + else: + table_line.append("") + + if proc_group_info.return_codes is not None: + table_line.append( + f"{','.join(str(ret) for ret in proc_group_info.return_codes)}" + ) + else: + table_line.append("") + + if proc_group_info.puids is not None: + table_line.append(f"{len(proc_group_info.puids)}") + else: + table_line.append("") + + return table_line + + @property + def step_table(self) -> str: + """Table representation of all jobs which have been started on the server.""" + headers = ["Step", "Status", "Hosts", "Return codes", "Num procs"] + + group_infos = self._backend.group_infos + + colalign = ( + ["left", "left", "left", "center", "center"] + if len(group_infos) > 0 + else None + ) + values = [ + self._proc_group_info_table_line(step, group_info) + for step, group_info in group_infos.items() + ] + + return tabulate( + values, + headers, + disable_numparse=True, + tablefmt="github", 
+ colalign=colalign, + ) + + @property + def host_table(self) -> str: + """Table representation of current state of nodes available + + in the allocation. + """ + headers = ["Host", "Status"] + hosts = self._backend.hosts + free_hosts = self._backend.free_hosts + + def _host_table_line(host: str) -> list[str]: + return [host, "Free" if host in free_hosts else "Busy"] + + colalign = ["left", "center"] if len(hosts) > 0 else None + values = [_host_table_line(host) for host in hosts] + + return tabulate( + values, headers, disable_numparse=True, tablefmt="github", colalign=colalign + ) diff --git a/tests/test_dragon_backend.py b/tests/test_dragon_backend.py index 1419d47d5..3ff75366e 100644 --- a/tests/test_dragon_backend.py +++ b/tests/test_dragon_backend.py @@ -26,6 +26,7 @@ import collections import sys +import textwrap import time from unittest.mock import MagicMock @@ -101,7 +102,6 @@ def get_mock_backend(monkeypatch: pytest.MonkeyPatch) -> "DragonBackend": from smartsim._core.launcher.dragon.dragonBackend import DragonBackend dragon_backend = DragonBackend(pid=99999) - monkeypatch.setattr(dragon_backend, "_hosts", ["node1", "node2", "node3"]) monkeypatch.setattr( dragon_backend, "_free_hosts", collections.deque(dragon_backend._hosts) ) @@ -140,7 +140,7 @@ def set_mock_group_infos( hosts[1:2], MagicMock(), ), - "c1091vz-3": ProcessGroupInfo( + "c101vz-3": ProcessGroupInfo( SmartSimStatus.STATUS_COMPLETED, MagicMock(), [125, 126], @@ -199,14 +199,17 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: step_id = run_resp.step_id assert dragon_backend._queued_steps[step_id] == run_req + mock_process_group = MagicMock(puids=[123,124]) + + dragon_backend._group_infos[step_id].process_group = mock_process_group dragon_backend._group_infos[step_id].puids = [123, 124] dragon_backend._start_steps() assert dragon_backend._running_steps == [step_id] assert len(dragon_backend._queued_steps) == 0 assert len(dragon_backend._free_hosts) == 1 - assert 
dragon_backend._allocated_hosts["node1"] == step_id - assert dragon_backend._allocated_hosts["node2"] == step_id + assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id + assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id monkeypatch.setattr( dragon_backend._group_infos[step_id].process_group, "status", "Running" @@ -217,8 +220,8 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: assert dragon_backend._running_steps == [step_id] assert len(dragon_backend._queued_steps) == 0 assert len(dragon_backend._free_hosts) == 1 - assert dragon_backend._allocated_hosts["node1"] == step_id - assert dragon_backend._allocated_hosts["node2"] == step_id + assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id + assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id dragon_backend._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED @@ -283,7 +286,7 @@ def test_shutdown_request( monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", "0") dragon_backend = get_mock_backend(monkeypatch) monkeypatch.setattr(dragon_backend, "_cooldown_period", 1) - _ = set_mock_group_infos(monkeypatch, dragon_backend) + set_mock_group_infos(monkeypatch, dragon_backend) shutdown_req = DragonShutdownRequest( immediate=immediate, frontend_shutdown=frontend_shutdown @@ -353,3 +356,26 @@ def test_get_id(monkeypatch: pytest.MonkeyPatch) -> None: assert step_id.endswith("0") assert step_id != next(dragon_backend._step_ids) + + +def test_view(monkeypatch: pytest.MonkeyPatch) -> None: + dragon_backend = get_mock_backend(monkeypatch) + set_mock_group_infos(monkeypatch, dragon_backend) + hosts = dragon_backend.hosts + + expected_message = textwrap.dedent(f"""\ + Dragon server backend update + | Host | Status | + |---------|----------| + | {hosts[0]} | Busy | + | {hosts[1]} | Free | + | {hosts[2]} | Free | + | Step | Status | Hosts | Return codes | Num procs | + 
|----------|--------------|-----------------|----------------|-------------| + | abc123-1 | Running | {hosts[0]} | | 1 | + | del999-2 | Cancelled | {hosts[1]} | -9 | 1 | + | c101vz-3 | Completed | {hosts[1]},{hosts[2]} | 0 | 2 | + | 0ghjk1-4 | Failed | {hosts[2]} | -1 | 1 | + | ljace0-5 | NeverStarted | | | 0 |""") + + assert dragon_backend.status_message == expected_message From 6a9cf0ee688ecfa768d67f91601810d357964632 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 23:59:10 +0200 Subject: [PATCH 073/101] Lint, Mypy --- smartsim/_core/launcher/dragon/dragonBackend.py | 7 ++++--- smartsim/_core/launcher/dragon/dragonConnector.py | 1 - tests/test_dragon_backend.py | 10 ++++++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 9ff084ea8..ca815c005 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -187,8 +187,6 @@ def __init__(self, pid: int) -> None: self._view = DragonBackendView(self) logger.debug(self._view.host_desc) - - @property def hosts(self) -> list[str]: with self._queue_lock: @@ -230,7 +228,10 @@ def status_message(self) -> str: :returns: Status message """ - return f"Dragon server backend update\n{self._view.host_table}\n{self._view.step_table}" + return ( + "Dragon server backend update\n" + f"{self._view.host_table}\n{self._view.step_table}" + ) def _heartbeat(self) -> None: self._last_beat = self.current_time diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 16c9b1bd5..46b49cb93 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -33,7 +33,6 @@ import os import subprocess import sys -import time import typing as t from collections import defaultdict from pathlib import Path diff --git 
a/tests/test_dragon_backend.py b/tests/test_dragon_backend.py index 3ff75366e..57ac65e6c 100644 --- a/tests/test_dragon_backend.py +++ b/tests/test_dragon_backend.py @@ -60,9 +60,14 @@ def Error(self) -> MagicMock: return error +class ProcessGroupMock(MagicMock): + puids = [121, 122] + + def get_mock_backend(monkeypatch: pytest.MonkeyPatch) -> "DragonBackend": process_mock = MagicMock(returncode=0) + process_group_mock = MagicMock(**{"Process.return_value": ProcessGroupMock()}) process_module_mock = MagicMock() process_module_mock.Process = process_mock node_mock = NodeMock() @@ -75,6 +80,7 @@ def get_mock_backend(monkeypatch: pytest.MonkeyPatch) -> "DragonBackend": "native.machine.Node.return_value": node_mock, "native.machine.System.return_value": system_mock, "native.group_state": GroupStateMock(), + "native.process_group.ProcessGroup.return_value": ProcessGroupMock(), } ), ) @@ -89,7 +95,7 @@ def get_mock_backend(monkeypatch: pytest.MonkeyPatch) -> "DragonBackend": MagicMock(**{"Policy.return_value": MagicMock()}), ) monkeypatch.setitem(sys.modules, "dragon.native.process", process_module_mock) - monkeypatch.setitem(sys.modules, "dragon.native.process_group", MagicMock()) + monkeypatch.setitem(sys.modules, "dragon.native.process_group", process_group_mock) monkeypatch.setitem(sys.modules, "dragon.native.group_state", GroupStateMock()) monkeypatch.setitem( @@ -199,7 +205,7 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: step_id = run_resp.step_id assert dragon_backend._queued_steps[step_id] == run_req - mock_process_group = MagicMock(puids=[123,124]) + mock_process_group = MagicMock(puids=[123, 124]) dragon_backend._group_infos[step_id].process_group = mock_process_group dragon_backend._group_infos[step_id].puids = [123, 124] From f239a0d702ce2ceab35d771f53ab7fb756d59324 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 23:20:19 +0000 Subject: [PATCH 074/101] Adapt PBS parsing and tests to run on Polaris too --- 
smartsim/_core/launcher/pbs/pbsLauncher.py | 4 ++-- smartsim/_core/launcher/pbs/pbsParser.py | 12 +++++----- tests/full_wlm/test_generic_batch_launch.py | 5 ++++- .../full_wlm/test_generic_orc_launch_batch.py | 22 +++++++++++++++---- tests/test_pbs_parser.py | 20 +++++++++++++++++ 5 files changed, 50 insertions(+), 13 deletions(-) diff --git a/smartsim/_core/launcher/pbs/pbsLauncher.py b/smartsim/_core/launcher/pbs/pbsLauncher.py index 9951b9bbd..cb51812d7 100644 --- a/smartsim/_core/launcher/pbs/pbsLauncher.py +++ b/smartsim/_core/launcher/pbs/pbsLauncher.py @@ -196,7 +196,7 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: # Fallback: if all jobs result as NOTFOUND, it might be an issue # with truncated names, we resort to json format which does not truncate # information - if all(stat == "NOTFOUND" for stat in stats): + if all(stat is None for stat in stats): qstat_out_json, _ = qstat(["-f", "-F", "json"] + step_ids) stats = [ parse_qstat_jobid_json(qstat_out_json, str(step_id)) @@ -206,7 +206,7 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: # create PBSStepInfo objects to return for stat, _ in zip(stats, step_ids): - info = PBSStepInfo(stat, None) + info = PBSStepInfo(stat or "NOTFOUND", None) # account for case where job history is not logged by PBS if info.status == SmartSimStatus.STATUS_COMPLETED: info.returncode = 0 diff --git a/smartsim/_core/launcher/pbs/pbsParser.py b/smartsim/_core/launcher/pbs/pbsParser.py index 603e825e7..0aeaee440 100644 --- a/smartsim/_core/launcher/pbs/pbsParser.py +++ b/smartsim/_core/launcher/pbs/pbsParser.py @@ -61,7 +61,7 @@ def parse_qsub_error(output: str) -> str: return base_err -def parse_qstat_jobid(output: str, job_id: str) -> str: +def parse_qstat_jobid(output: str, job_id: str) -> t.Optional[str]: """Parse and return output of the qstat command run with options to obtain job status. 
@@ -72,7 +72,7 @@ def parse_qstat_jobid(output: str, job_id: str) -> str: :return: status :rtype: str """ - result = "NOTFOUND" + result = None for line in output.split("\n"): fields = line.split() if len(fields) >= 5: @@ -83,7 +83,7 @@ def parse_qstat_jobid(output: str, job_id: str) -> str: return result -def parse_qstat_jobid_json(output: str, job_id: str) -> str: +def parse_qstat_jobid_json(output: str, job_id: str) -> t.Optional[str]: """Parse and return output of the qstat command run with JSON options to obtain job status. @@ -94,12 +94,12 @@ def parse_qstat_jobid_json(output: str, job_id: str) -> str: out_json = load_and_clean_json(output) if "Jobs" not in out_json: - return "NOTFOUND" + return None jobs: dict[str, t.Any] = out_json["Jobs"] job: t.Optional[dict[str, t.Any]] = jobs.get(job_id, None) if job is None: - return "NOTFOUND" - return str(job.get("job_state", "NOTFOUND")) + return None + return str(job.get("job_state", None)) def parse_qstat_nodes(output: str) -> t.List[str]: diff --git a/tests/full_wlm/test_generic_batch_launch.py b/tests/full_wlm/test_generic_batch_launch.py index dc9878d6c..0958b9652 100644 --- a/tests/full_wlm/test_generic_batch_launch.py +++ b/tests/full_wlm/test_generic_batch_launch.py @@ -45,7 +45,10 @@ def add_batch_resources(wlmutils, batch_settings): if isinstance(batch_settings, QsubBatchSettings): for key, value in wlmutils.get_batch_resources().items(): - batch_settings.set_resource(key, value) + if key == "queue": + batch_settings.set_queue(value) + else: + batch_settings.set_resource(key, value) def test_batch_model(fileutils, test_dir, wlmutils): diff --git a/tests/full_wlm/test_generic_orc_launch_batch.py b/tests/full_wlm/test_generic_orc_launch_batch.py index fc725f4b3..a80310a12 100644 --- a/tests/full_wlm/test_generic_orc_launch_batch.py +++ b/tests/full_wlm/test_generic_orc_launch_batch.py @@ -30,6 +30,7 @@ import pytest from smartsim import Experiment +from smartsim.settings.pbsSettings import QsubBatchSettings 
from smartsim.status import SmartSimStatus # retrieved from pytest fixtures @@ -42,6 +43,15 @@ ) +def add_batch_resources(wlmutils, batch_settings): + if isinstance(batch_settings, QsubBatchSettings): + for key, value in wlmutils.get_batch_resources().items(): + if key == "queue": + batch_settings.set_queue(value) + else: + batch_settings.set_resource(key, value) + + def test_launch_orc_auto_batch(test_dir, wlmutils): """test single node orchestrator""" launcher = wlmutils.get_test_launcher() @@ -59,8 +69,9 @@ def test_launch_orc_auto_batch(test_dir, wlmutils): ) orc.batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, orc.batch_settings) - orc.batch_settings.set_walltime("00:02:00") + orc.batch_settings.set_walltime("00:05:00") orc.set_path(test_dir) exp.start(orc, block=True) @@ -95,8 +106,9 @@ def test_launch_cluster_orc_batch_single(test_dir, wlmutils): ) orc.batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, orc.batch_settings) - orc.batch_settings.set_walltime("00:02:00") + orc.batch_settings.set_walltime("00:05:00") orc.set_path(test_dir) exp.start(orc, block=True) @@ -131,8 +143,9 @@ def test_launch_cluster_orc_batch_multi(test_dir, wlmutils): ) orc.batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, orc.batch_settings) - orc.batch_settings.set_walltime("00:03:00") + orc.batch_settings.set_walltime("00:05:00") orc.set_path(test_dir) exp.start(orc, block=True) @@ -162,8 +175,9 @@ def test_launch_cluster_orc_reconnect(test_dir, wlmutils): orc.set_path(test_dir) orc.batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, orc.batch_settings) - orc.batch_settings.set_walltime("00:02:00") + orc.batch_settings.set_walltime("00:05:00") exp.start(orc, block=True) diff --git a/tests/test_pbs_parser.py b/tests/test_pbs_parser.py index f77eb7c93..ae01ffb19 100644 --- a/tests/test_pbs_parser.py +++ b/tests/test_pbs_parser.py @@ 
-72,3 +72,23 @@ def test_parse_qstat_status(): status = "R" parsed_status = pbsParser.parse_qstat_jobid(output, "1289903.sdb") assert status == parsed_status + + +def test_parse_qstat_status_not_found(): + output = ( + "Job id Name User Time Use S Queue\n" + "---------------- ---------------- ---------------- -------- - -----\n" + "1289903.sdb jobname username 00:00:00 R queue\n" + ) + parsed_status = pbsParser.parse_qstat_jobid(output, "9999999.sdb") + + assert parsed_status is None + + +def test_parse_qstat_status_json(fileutils): + """Parse nodes from qsub called with -f -F json""" + file_path = fileutils.get_test_conf_path("qstat.json") + output = Path(file_path).read_text() + status = "R" + parsed_status = pbsParser.parse_qstat_jobid_json(output, "16705.sdb") + assert status == parsed_status From 100d39c313ee468cf8a37220e7f9c9f92e6308d7 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 May 2024 23:47:07 +0000 Subject: [PATCH 075/101] Add pals to supported batch settings --- smartsim/settings/settings.py | 1 + 1 file changed, 1 insertion(+) diff --git a/smartsim/settings/settings.py b/smartsim/settings/settings.py index b7982a2dc..b046a5316 100644 --- a/smartsim/settings/settings.py +++ b/smartsim/settings/settings.py @@ -84,6 +84,7 @@ def create_batch_settings( "pbs": QsubBatchSettings, "slurm": SbatchSettings, "lsf": BsubBatchSettings, + "pals": QsubBatchSettings, } if launcher in ["auto", "dragon"]: From b10cfa36eb78928a22d8312bd09a6b3fe7059552 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 6 May 2024 17:37:30 +0200 Subject: [PATCH 076/101] Fix DragonLauncher docstrings --- smartsim/_core/launcher/dragon/dragonLauncher.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py index 1c3f57111..ea919ba47 100644 --- a/smartsim/_core/launcher/dragon/dragonLauncher.py +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -64,16 
+64,22 @@ class DragonLauncher(WLMLauncher): All WLM launchers are capable of launching managed and unmanaged jobs. Managed jobs are queried through interaction with with WLM, - in this case Dragon. Unmanaged jobs are held in the TaskManager + in this case the Dragon server. Unmanaged jobs are held in the TaskManager and are managed through references to their launching process ID - i.e. a psutil.Popen object + i.e. a psutil.Popen object. + Batch Jobs are routed to either Slurm or PBS and their step ID + is stored, prefixed with the name of the scheduler, to allow + the Job Manager to interact with it. """ def __init__(self) -> None: super().__init__() self._connector = DragonConnector() + """Connector used to start and interact with the Dragon server""" self._slurm_launcher = SlurmLauncher() + """Slurm sub-launcher, used only for batch jobs""" self._pbs_launcher = PBSLauncher() + """PBS sub-launcher, used only for batch jobs""" @property def is_connected(self) -> bool: From 1490c6423b73a856f9eb3ed75cc7689d89b7fa64 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 6 May 2024 12:54:59 -0500 Subject: [PATCH 077/101] Update server can shutdown --- smartsim/_core/launcher/dragon/dragonBackend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index ca815c005..b483af699 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -550,7 +550,7 @@ def _update_shutdown_status(self) -> None: and grp_info.process_group is None and grp_info.redir_workers is None for grp_info in self._group_infos.values() - ) + ) and self._shutdown_requested def _should_print_status(self) -> bool: if self.current_time - self._last_update_time > 10: From 457555b11b9e91b6d838cae07d9053357b50ce67 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 7 May 2024 12:20:36 -0500 Subject: [PATCH 078/101] Make style --- 
smartsim/_core/launcher/dragon/dragonBackend.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index b483af699..37706fed1 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -545,12 +545,15 @@ def _refresh_statuses(self) -> None: def _update_shutdown_status(self) -> None: self._heartbeat() with self._queue_lock: - self._can_shutdown |= all( - grp_info.status in TERMINAL_STATUSES - and grp_info.process_group is None - and grp_info.redir_workers is None - for grp_info in self._group_infos.values() - ) and self._shutdown_requested + self._can_shutdown |= ( + all( + grp_info.status in TERMINAL_STATUSES + and grp_info.process_group is None + and grp_info.redir_workers is None + for grp_info in self._group_infos.values() + ) + and self._shutdown_requested + ) def _should_print_status(self) -> bool: if self.current_time - self._last_update_time > 10: From 4fc18198b23ef0afdffd835d2eb235b81dc5fd01 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 7 May 2024 16:15:20 -0500 Subject: [PATCH 079/101] Add delay for dragon launcher teardown in tests --- conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/conftest.py b/conftest.py index e327b5fc2..4ddd99ce5 100644 --- a/conftest.py +++ b/conftest.py @@ -168,6 +168,8 @@ def pytest_sessionfinish( break else: # kill all spawned processes + if CONFIG.test_launcher == "dragon": + time.sleep(5) kill_all_test_spawned_processes() From 1259d3e3d10b26f83ad626a953ffcbcf9a8ae701 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 7 May 2024 16:54:48 -0500 Subject: [PATCH 080/101] Make style --- smartsim/_core/launcher/dragon/dragonConnector.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 
46b49cb93..6c47fc84e 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -258,9 +258,10 @@ def connect_to_dragon(self) -> None: dragon_out_file = path / "dragon_head.out" dragon_err_file = path / "dragon_head.err" - with open(dragon_out_file, "w", encoding="utf-8") as dragon_out, open( - dragon_err_file, "w", encoding="utf-8" - ) as dragon_err: + with ( + open(dragon_out_file, "w", encoding="utf-8") as dragon_out, + open(dragon_err_file, "w", encoding="utf-8") as dragon_err, + ): current_env = os.environ.copy() current_env.update({"PYTHONUNBUFFERED": "1"}) logger.debug(f"Starting Dragon environment: {' '.join(cmd)}") From 978698a54ccd37f9551bbcaf8ae43139828fcdc1 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 7 May 2024 17:59:09 -0500 Subject: [PATCH 081/101] Fix telemetry, disable test on dragon --- smartsim/_core/utils/telemetry/telemetry.py | 3 --- tests/test_dragon_backend.py | 10 ++++++++++ tests/test_telemetry_monitor.py | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/smartsim/_core/utils/telemetry/telemetry.py b/smartsim/_core/utils/telemetry/telemetry.py index 8c6a8291c..6d51e1ba2 100644 --- a/smartsim/_core/utils/telemetry/telemetry.py +++ b/smartsim/_core/utils/telemetry/telemetry.py @@ -151,9 +151,6 @@ def set_launcher( if self._launcher is None: raise SmartSimError("Launcher init failed") - if isinstance(self._launcher, DragonLauncher): - self._launcher.connect_to_dragon(exp_dir) - self.job_manager.set_launcher(self._launcher) self.job_manager.start() diff --git a/tests/test_dragon_backend.py b/tests/test_dragon_backend.py index 57ac65e6c..c78b078f2 100644 --- a/tests/test_dragon_backend.py +++ b/tests/test_dragon_backend.py @@ -32,6 +32,16 @@ import pytest +try: + import dragon +except ImportError: + pass +else: + pytest.skip( + reason="Using dragon as launcher, not running Dragon unit tests", + allow_module_level=True + ) + from smartsim._core.config 
import CONFIG from smartsim._core.schemas.dragonRequests import * from smartsim._core.schemas.dragonResponses import * diff --git a/tests/test_telemetry_monitor.py b/tests/test_telemetry_monitor.py index c1bfe2719..0031c7fe4 100644 --- a/tests/test_telemetry_monitor.py +++ b/tests/test_telemetry_monitor.py @@ -1295,7 +1295,7 @@ def _faux_updates(_self: WLMLauncher, _names: t.List[str]) -> t.List[StepInfo]: ctx.setattr(SlurmLauncher, "get_step_update", get_faux_update(status_in)) mani_handler = ManifestEventHandler("xyz", logger) - mani_handler.set_launcher("slurm") + mani_handler.set_launcher("slurm", test_dir) # prep a fake job to request updates for job_entity = JobEntity() From 91c99f7768cf38d71a180cf9ab790a118fdce046 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 7 May 2024 18:02:04 -0500 Subject: [PATCH 082/101] Revert addition of useless arg --- smartsim/_core/utils/telemetry/telemetry.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/smartsim/_core/utils/telemetry/telemetry.py b/smartsim/_core/utils/telemetry/telemetry.py index 6d51e1ba2..4301d9110 100644 --- a/smartsim/_core/utils/telemetry/telemetry.py +++ b/smartsim/_core/utils/telemetry/telemetry.py @@ -141,7 +141,7 @@ def init_job_manager(self) -> None: self.job_manager.start() def set_launcher( - self, launcher_type: str, exp_dir: t.Union[str, "os.PathLike[str]"] + self, launcher_type: str ) -> None: """Set the launcher for the experiment :param launcher_type: the name of the workload manager used by the experiment @@ -176,9 +176,8 @@ def process_manifest(self, manifest_path: str) -> None: logger.error("Manifest content error", exc_info=True) return - exp_dir = pathlib.Path(manifest_path).parent.parent.parent if self._launcher is None: - self.set_launcher(manifest.launcher, exp_dir) + self.set_launcher(manifest.launcher) if not self._launcher: raise SmartSimError(f"Unable to set launcher from {manifest_path}") From 43c4d29164c93e02595653a742dd62df6b5357c6 Mon Sep 17 
00:00:00 2001 From: Al Rigazzi Date: Tue, 7 May 2024 18:12:11 -0500 Subject: [PATCH 083/101] Commit test files --- tests/test_dragon_backend.py | 2 +- tests/test_telemetry_monitor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_dragon_backend.py b/tests/test_dragon_backend.py index c78b078f2..bf06d93eb 100644 --- a/tests/test_dragon_backend.py +++ b/tests/test_dragon_backend.py @@ -39,7 +39,7 @@ else: pytest.skip( reason="Using dragon as launcher, not running Dragon unit tests", - allow_module_level=True + allow_module_level=True, ) from smartsim._core.config import CONFIG diff --git a/tests/test_telemetry_monitor.py b/tests/test_telemetry_monitor.py index 0031c7fe4..c1bfe2719 100644 --- a/tests/test_telemetry_monitor.py +++ b/tests/test_telemetry_monitor.py @@ -1295,7 +1295,7 @@ def _faux_updates(_self: WLMLauncher, _names: t.List[str]) -> t.List[StepInfo]: ctx.setattr(SlurmLauncher, "get_step_update", get_faux_update(status_in)) mani_handler = ManifestEventHandler("xyz", logger) - mani_handler.set_launcher("slurm", test_dir) + mani_handler.set_launcher("slurm") # prep a fake job to request updates for job_entity = JobEntity() From 3f9bbce2c65baad4847a21833c78c0eaf73c4542 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 8 May 2024 01:29:06 -0500 Subject: [PATCH 084/101] black --- smartsim/_core/utils/telemetry/telemetry.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/smartsim/_core/utils/telemetry/telemetry.py b/smartsim/_core/utils/telemetry/telemetry.py index 4301d9110..e9e4c46bc 100644 --- a/smartsim/_core/utils/telemetry/telemetry.py +++ b/smartsim/_core/utils/telemetry/telemetry.py @@ -140,9 +140,7 @@ def init_job_manager(self) -> None: self.job_manager.set_launcher(self._launcher) self.job_manager.start() - def set_launcher( - self, launcher_type: str - ) -> None: + def set_launcher(self, launcher_type: str) -> None: """Set the launcher for the experiment :param launcher_type: the name of the 
workload manager used by the experiment """ From 0d6e8256fa5360d3e999fa2a095487cc45e1e3f9 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 8 May 2024 19:50:29 +0200 Subject: [PATCH 085/101] Remove types from docstrings --- smartsim/_core/config/config.py | 2 +- smartsim/_core/launcher/dragon/dragonBackend.py | 2 -- .../_core/launcher/dragon/dragonConnector.py | 3 --- smartsim/_core/launcher/dragon/dragonLauncher.py | 6 ------ smartsim/_core/launcher/dragon/dragonSockets.py | 8 ++------ smartsim/_core/launcher/launcher.py | 2 -- smartsim/_core/launcher/step/dragonStep.py | 11 ----------- smartsim/_core/utils/network.py | 2 -- smartsim/_core/utils/security.py | 16 +++------------- smartsim/settings/dragonRunSettings.py | 15 --------------- 10 files changed, 6 insertions(+), 61 deletions(-) diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 97a08eb1a..33f4270ee 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -275,7 +275,7 @@ def smartsim_key_path(self) -> str: variable `SMARTSIM_KEY_PATH`. :returns: The configured key path. - :rtype: str""" + """ default_path = Path.home() / ".smartsim" / "keys" return os.environ.get("SMARTSIM_KEY_PATH", str(default_path)) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 37706fed1..71012886a 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -116,9 +116,7 @@ def redir_worker(io_conn: dragon_connection.Connection, file_path: str) -> None: """Read stdout/stderr from the Dragon connection. 
:param io_conn: Dragon connection to stdout or stderr - :type io_conn: dragon.infrastructure.connection.Connection :param file_path: path to file to write to - :type file_path: str """ while io_conn is None or not io_conn.readable: time.sleep(0.1) diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 6c47fc84e..065a36d44 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -400,11 +400,8 @@ def _dragon_cleanup( ) -> None: """Clean up resources used by the launcher. :param server_socket: (optional) Socket used to connect to dragon environment - :type server_socket: Optional[zmq.Socket] :param server_process_pid: (optional) Process ID of the dragon entrypoint - :type server_process_pid: Optional[int] :param server_authenticator: (optional) Authenticator used to secure sockets - :type server_authenticator: Optional[zmq.auth.thread.ThreadAuthenticator] """ try: if server_socket is not None: diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py index ea919ba47..f33c19ad5 100644 --- a/smartsim/_core/launcher/dragon/dragonLauncher.py +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -123,10 +123,8 @@ def run(self, step: Step) -> t.Optional[str]: """Run a job step through Slurm :param step: a job step instance - :type step: Step :raises LauncherError: if launch fails :return: job step id if job is managed - :rtype: str """ if not self.task_manager.actively_monitoring: @@ -204,9 +202,7 @@ def stop(self, step_name: str) -> StepInfo: """Step a job step :param step_name: name of the job to stop - :type step_name: str :return: update for job due to cancel - :rtype: StepInfo """ stepmap = self.step_mapping[step_name] @@ -241,9 +237,7 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: """Get step updates for Dragon-managed jobs :param step_ids: list 
of job step ids - :type step_ids: list[str] :return: list of updates for managed jobs - :rtype: list[StepInfo] """ step_id_updates: dict[str, StepInfo] = {} diff --git a/smartsim/_core/launcher/dragon/dragonSockets.py b/smartsim/_core/launcher/dragon/dragonSockets.py index ee6e644a0..80acd61a2 100644 --- a/smartsim/_core/launcher/dragon/dragonSockets.py +++ b/smartsim/_core/launcher/dragon/dragonSockets.py @@ -75,14 +75,11 @@ def get_secure_socket( """Create secured socket that consumes & produces encrypted messages :param context: ZMQ context object - :type context: zmq.Context :param socket_type: Type of ZMQ socket to create - :type socket_type: zmq.SocketType :param is_server: Pass `True` to secure the socket as server. Pass `False` to secure the socket as a client. - :type is_server: bool :returns: the secured socket prepared for sending encrypted messages - :rtype: zmq.Socket""" + """ config = get_config() socket: "Socket[t.Any]" = context.socket(socket_type) @@ -114,9 +111,8 @@ def get_authenticator( """Create an authenticator to handle encryption of ZMQ communications :param context: ZMQ context object - :type context: zmq.Context :returns: the activated `Authenticator` - :rtype: zmq.auth.thread.ThreadAuthenticator""" + """ # pylint: disable-next=global-statement global AUTHENTICATOR diff --git a/smartsim/_core/launcher/launcher.py b/smartsim/_core/launcher/launcher.py index ff3cf025f..1bf768065 100644 --- a/smartsim/_core/launcher/launcher.py +++ b/smartsim/_core/launcher/launcher.py @@ -75,9 +75,7 @@ def add_step_to_mapping_table(self, name: str, step_map: StepMap) -> None: making it monitor the step. 
:param name: name of step to be added - :type name: str :param step_map: step map of added step - :type step_map: StepMap """ self.step_mapping[name] = step_map diff --git a/smartsim/_core/launcher/step/dragonStep.py b/smartsim/_core/launcher/step/dragonStep.py index 8f95b9839..036a9e565 100644 --- a/smartsim/_core/launcher/step/dragonStep.py +++ b/smartsim/_core/launcher/step/dragonStep.py @@ -49,11 +49,8 @@ def __init__(self, name: str, cwd: str, run_settings: DragonRunSettings) -> None """Initialize a srun job step :param name: name of the entity to be launched - :type name: str :param cwd: path to launch dir - :type cwd: str :param run_settings: run settings for entity - :type run_settings: SrunSettings """ super().__init__(name, cwd, run_settings) self.managed = True @@ -67,7 +64,6 @@ def get_launch_cmd(self) -> t.List[str]: needed to launch this step :return: launch command - :rtype: list[str] """ run_settings = self.run_settings exe_cmd = [] @@ -112,11 +108,8 @@ def __init__( """Initialize a Slurm Sbatch step :param name: name of the entity to launch - :type name: str :param cwd: path to launch dir - :type cwd: str :param batch_settings: batch settings for entity - :type batch_settings: SbatchSettings """ super().__init__(name, cwd, batch_settings) self.steps: t.List[Step] = [] @@ -128,7 +121,6 @@ def get_launch_cmd(self) -> t.List[str]: """Get the launch command for the batch :return: launch command for the batch - :rtype: list[str] """ if isinstance(self.batch_settings, SbatchSettings): script = self._write_sbatch_script() @@ -145,7 +137,6 @@ def add_to_batch(self, step: Step) -> None: """Add a job step to this batch :param step: a job step instance e.g. 
DragonStep - :type step: Step """ self.steps.append(step) logger.debug(f"Added step command to batch for {step.name}") @@ -199,7 +190,6 @@ def _write_sbatch_script(self) -> str: """Write the PBS batch script :return: batch script path after writing - :rtype: str """ batch_script = self.get_step_file(ending=".sh") output, error = self.get_output_files() @@ -231,7 +221,6 @@ def _write_qsub_script(self) -> str: """Write the Slurm batch script :return: batch script path after writing - :rtype: str """ batch_script = self.get_step_file(ending=".sh") output, error = self.get_output_files() diff --git a/smartsim/_core/utils/network.py b/smartsim/_core/utils/network.py index 787c0f0aa..aaceb7fc6 100644 --- a/smartsim/_core/utils/network.py +++ b/smartsim/_core/utils/network.py @@ -104,9 +104,7 @@ def find_free_port(start: int = 0) -> int: """A 'good enough' way to find an open port to bind to :param start: The first port number to consider - :type start: int :returns: The first open port found - :rtype: int """ port_num = -1 while port_num < 0: diff --git a/smartsim/_core/utils/security.py b/smartsim/_core/utils/security.py index f6607a2e5..e6f84c81a 100644 --- a/smartsim/_core/utils/security.py +++ b/smartsim/_core/utils/security.py @@ -93,14 +93,10 @@ def __init__( """Initiailize a `KeyLocator` :param root_dir: root path where keys are persisted to disk - :type root_dir: pathlib.Path :param filename: the stem name of the key file - :type filename: str :param category: the category or use-case for the key (e.g. server) - :type category: str :param separate_keys: flag indicating if public and private keys should be persisted in separate, corresponding directories - :type separate_keys: bool """ # constants for standardized paths. @@ -170,13 +166,11 @@ def __init__( ) -> None: """Initialize a KeyManager instance. 
:param config: SmartSim configuration - :type config: Config :param as_server: flag to indicate when executing in the server context; set to `True` to avoid loading client secret key - :type as_server: bool :param as_client: flag to indicate when executing in the client context; set to `True` to avoid loading server secret key - :type as_client: bool""" + """ self._as_server = as_server """Set to `True` to return keys appropriate for the server context""" @@ -213,13 +207,10 @@ def _load_keypair(cls, locator: _KeyLocator, in_context: bool) -> KeyPair: """Load a specific `KeyPair` from disk :param locator: a `KeyLocator` that specifies the path to an existing key - :type locator: KeyLocator :param in_context: Boolean flag indicating if the keypair is the active context; ensures the public and private keys are both loaded when `True`. Only the public key is loaded when `False` - :type in_context: bool :returns: a KeyPair containing the loaded public/private key - :rtype: KeyPair """ # private keys contain public & private key parts key_path = locator.private if in_context else locator.public @@ -241,7 +232,7 @@ def _load_keys(self) -> t.Tuple[KeyPair, KeyPair]: components from the standard key paths for the associated experiment :returns: 2-tuple of `KeyPair` (server_keypair, client_keypair) - :rtype: Tuple[KeyPair, KeyPair]""" + ]""" try: server_keys = self._load_keypair(self._server_locator, self._as_server) client_keys = self._load_keypair(self._client_locator, self._as_client) @@ -260,7 +251,7 @@ def _move_public_key(cls, locator: _KeyLocator) -> None: :param locator: `KeyLocator` that determines the path to the key pair persisted in the same directory. 
- :type locator: KeyLocator""" + """ new_path = locator.private.with_suffix(locator.public.suffix) if new_path != locator.public: logger.debug(f"Moving key file from {locator.public} to {new_path}") @@ -286,7 +277,6 @@ def get_keys(self, create: bool = True) -> t.Tuple[KeyPair, KeyPair]: :param no_create: pass `no_create=True` to ensure keys are not created and only pre-existing keys can be loaded :returns: 2-tuple of `KeyPair` (server_keypair, client_keypair) - :rtype: Tuple[KeyPair, KeyPair] """ logger.debug(f"Loading keys, creation {'is' if create else 'not'} allowed") server_keys, client_keys = self._load_keys() diff --git a/smartsim/settings/dragonRunSettings.py b/smartsim/settings/dragonRunSettings.py index a098993b5..d888d867c 100644 --- a/smartsim/settings/dragonRunSettings.py +++ b/smartsim/settings/dragonRunSettings.py @@ -53,15 +53,10 @@ def __init__( parameters will launch on that allocation. :param exe: executable to run - :type exe: str :param exe_args: executable arguments, defaults to None - :type exe_args: list[str] | str, optional :param run_args: srun arguments without dashes, defaults to None - :type run_args: dict[str, t.Union[int, str, float, None]], optional :param env_vars: environment variables for job, defaults to None - :type env_vars: dict[str, str], optional :param alloc: allocation ID if running on existing alloc, defaults to None - :type alloc: str, optional """ super().__init__( exe, @@ -78,7 +73,6 @@ def set_nodes(self, nodes: int) -> None: """Set the number of nodes :param nodes: number of nodes to run with - :type nodes: int """ self.run_args["nodes"] = nodes @@ -86,7 +80,6 @@ def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: """Specify the hostlist for this job :param host_list: hosts to launch on - :type host_list: str | list[str] :raises TypeError: if not str or list of str """ if isinstance(host_list, str): @@ -101,7 +94,6 @@ def set_hostlist_from_file(self, file_path: str) -> None: """Use the contents of a 
file to set the node list :param file_path: Path to the hostlist file - :type file_path: str """ self.run_args["nodefile"] = file_path @@ -109,7 +101,6 @@ def set_excluded_hosts(self, host_list: t.Union[str, t.List[str]]) -> None: """Specify a list of hosts to exclude for launching this job :param host_list: hosts to exclude - :type host_list: list[str] :raises TypeError: """ if isinstance(host_list, str): @@ -124,7 +115,6 @@ def set_cpus_per_task(self, cpus_per_task: int) -> None: """Set the number of cpus to use per task :param num_cpus: number of cpus to use per task - :type num_cpus: int """ self.run_args["cpus-per-task"] = cpus_per_task @@ -132,7 +122,6 @@ def set_tasks(self, tasks: int) -> None: """Set the number of tasks for this job :param tasks: number of tasks - :type tasks: int """ self.run_args["ntasks"] = tasks @@ -140,7 +129,6 @@ def set_tasks_per_node(self, tasks_per_node: int) -> None: """Set the number of tasks for this job :param tasks_per_node: number of tasks per node - :type tasks_per_node: int """ self.run_args["tasks-per-node"] = tasks_per_node @@ -148,7 +136,6 @@ def set_memory_per_node(self, memory_per_node: int) -> None: """Specify the real memory required per node :param memory_per_node: Amount of memory per node in megabytes - :type memory_per_node: int """ self.run_args["mem"] = f"{memory_per_node}M" @@ -158,7 +145,6 @@ def set_verbose_launch(self, verbose: bool) -> None: This sets ``--verbose`` :param verbose: Whether the job should be run verbosely - :type verbose: bool """ if verbose: self.run_args["verbose"] = None @@ -171,6 +157,5 @@ def set_walltime(self, walltime: str) -> None: format = "HH:MM:SS" :param walltime: wall time - :type walltime: str """ self.run_args["time"] = str(walltime) From ab94bed2b0c0aaeb77c968e83073a25abd3c71d3 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 9 May 2024 17:43:01 +0200 Subject: [PATCH 086/101] Add docs, fix inconsistencies --- doc/api/smartsim_api.rst | 22 + doc/changelog.rst | 807 
------------------ doc/dragon.rst | 154 ++++ doc/experiment.rst | 5 +- doc/index.rst | 1 + doc/installation_instructions/basic.rst | 28 +- smartsim/_core/config/config.py | 2 +- .../_core/launcher/dragon/dragonConnector.py | 3 + smartsim/settings/dragonRunSettings.py | 85 +- 9 files changed, 210 insertions(+), 897 deletions(-) delete mode 100644 doc/changelog.rst create mode 100644 doc/dragon.rst diff --git a/doc/api/smartsim_api.rst b/doc/api/smartsim_api.rst index bb6a02b8e..af5f98dc2 100644 --- a/doc/api/smartsim_api.rst +++ b/doc/api/smartsim_api.rst @@ -162,6 +162,28 @@ and within batch launches (e.g., ``QsubBatchSettings``) :members: +.. _dragonsettings_api: + +DragonRunSettings +----------------- + +``DragonRunSettings`` can be used on systems that support Slurm or +PBS, if Dragon is available in the Python environment (see `_dragon_install` +for instructions on how to install it through ``smart``). + +``DragonRunSettings`` can be used in interactive sessions (on allocation) +and within batch launches (i.e. ``SbatchSettings`` or ``QsubBatchSettings``, +for Slurm and PBS sessions, respectively). + +.. autosummary:: + DragonRunSettings.set_nodes + DragonRunSettings.set_tasks_per_node + +.. autoclass:: DragonRunSettings + :inherited-members: + :undoc-members: + :members: + .. _jsrun_api: diff --git a/doc/changelog.rst b/doc/changelog.rst deleted file mode 100644 index 56bf86db2..000000000 --- a/doc/changelog.rst +++ /dev/null @@ -1,807 +0,0 @@ -********* -Changelog -********* - -Listed here are the changes between each release of SmartSim -and SmartRedis.
- -Jump to :ref:`SmartRedis Changelog ` - - -SmartSim -======== - -Development branch ------------------- - -To be released at some future point in time - -Description - -- Fix race condition for telemetry monitor -- Update watchdog dependency -- Historical output files stored under .smartsim directory -- Add option to build Torch backend without the Intel Math Kernel Library -- Fix ReadTheDocs build issue -- Promote device options to an Enum -- Update telemetry monitor, add telemetry collectors -- Add method to specify node features for a Slurm job -- Colo Orchestrator setup now blocks application start until setup finished -- ExecArgs handling correction -- ReadTheDocs config file added and enabled on PRs -- Enforce changelog updates -- Remove deprecated SmartSim modules -- SmartSim Documentation refactor -- Update the version of Redis from `7.0.4` to `7.2.4` -- Fix publishing of development docs -- Update Experiment API typing -- Minor enhancements to test suite -- Improve SmartSim experiment signal handlers - -Detailed Notes - -- Ensure the telemetry monitor is started prior to launching entities (SmartSim-PR549_) -- Update watchdog dependency from 3.x to 4.x, fix new type issues (SmartSim-PR540_) -- The dashboard needs to display historical logs, so log files are written - out under the .smartsim directory and files under the experiment - directory are symlinked to them. (SmartSim-PR532_) -- Add an option to smart build "--torch_with_mkl"/"--no_torch_with_mkl" to - prevent Torch from trying to link in the Intel Math Kernel Library. This - is needed because on machines that have the Intel compilers installed, the - Torch will unconditionally try to link in this library, however fails - because the linking flags are incorrect. (SmartSim-PR538_) -- Change type_extension and pydantic versions in readthedocs environment - to enable docs build. (SmartSim-PR537_) -- Promote devices to a dedicated Enum type throughout the SmartSim code base. 
-- Update the telemetry monitor to enable retrieval of metrics on a scheduled - interval. Switch basic experiment tracking telemetry to default to on. Add - database metric collectors. Improve telemetry monitor logging. Create - telemetry subpackage at `smartsim._core.utils.telemetry`. Refactor - telemetry monitor entrypoint. (SmartSim-PR460_) -- Users can now specify node features for a Slurm job through - ``SrunSettings.set_node_feature``. The method accepts a string - or list of strings. (SmartSim-PR529_) -- The request to the colocated entrypoints file within the shell script - is now a blocking process. Once the Orchestrator is setup, it returns - which moves the process to the background and allows the application to - start. This prevents the application from requesting a ML model or - script that has not been uploaded to the Orchestrator yet. (SmartSim-PR522_) -- Add checks and tests to ensure SmartSim users cannot initialize run settings - with a list of lists as the exe_args argument. (SmartSim-PR517_) -- Add readthedocs configuration file and enable readthedocs builds - on pull requests. Additionally added robots.txt file generation - when readthedocs environment detected. (SmartSim-PR512_) -- Add Github Actions workflow that checks if changelog is edited - on pull requests into develop. (SmartSim-PR518_) -- Removed deprecated SmartSim modules: slurm and mpirunSettings. - (SmartSim-PR514_) -- Implemented new structure of SmartSim documentation. Added examples - images and further detail of SmartSim components. (SmartSim-PR463_) -- Update Redis version to `7.2.4`. This change fixes an issue in the Redis - build scripts causing failures on Apple Silicon hosts. (SmartSim-PR507_) -- The container which builds the documentation for every merge to develop - was failing due to a lack of space within the container. 
This was fixed - by including an additional Github action that removes some unneeded - software and files that come from the default Github Ubuntu container. - (SmartSim-PR504_) -- Update the generic `t.Any` typehints in Experiment API. (SmartSim-PR501_) -- The CI will fail static analysis if common erroneous truthy checks are - detected. (SmartSim-PR524_) -- The CI will fail static analysis if a local variable used while potentially - undefined. (SmartSim-PR521_) -- Remove previously deprecated behavior present in test suite on machines with - Slurm and Open MPI. (SmartSim-PR520_) -- When calling ``Experiment.start`` SmartSim would register a signal handler - that would capture an interrupt signal (^C) to kill any jobs launched through - its ``JobManager``. This would replace the default (or user defined) signal - handler. SmartSim will now attempt to kill any launched jobs before calling - the previously registered signal handler. (SmartSim-PR535_) - -.. _SmartSim-PR549: https://github.com/CrayLabs/SmartSim/pull/549 -.. _SmartSim-PR540: https://github.com/CrayLabs/SmartSim/pull/540 -.. _SmartSim-PR532: https://github.com/CrayLabs/SmartSim/pull/532 -.. _SmartSim-PR538: https://github.com/CrayLabs/SmartSim/pull/538 -.. _SmartSim-PR537: https://github.com/CrayLabs/SmartSim/pull/537 -.. _SmartSim-PR498: https://github.com/CrayLabs/SmartSim/pull/498 -.. _SmartSim-PR460: https://github.com/CrayLabs/SmartSim/pull/460 -.. _SmartSim-PR512: https://github.com/CrayLabs/SmartSim/pull/512 -.. _SmartSim-PR535: https://github.com/CrayLabs/SmartSim/pull/535 -.. _SmartSim-PR529: https://github.com/CrayLabs/SmartSim/pull/529 -.. _SmartSim-PR522: https://github.com/CrayLabs/SmartSim/pull/522 -.. _SmartSim-PR521: https://github.com/CrayLabs/SmartSim/pull/521 -.. _SmartSim-PR524: https://github.com/CrayLabs/SmartSim/pull/524 -.. _SmartSim-PR520: https://github.com/CrayLabs/SmartSim/pull/520 -.. _SmartSim-PR518: https://github.com/CrayLabs/SmartSim/pull/518 -.. 
_SmartSim-PR517: https://github.com/CrayLabs/SmartSim/pull/517 -.. _SmartSim-PR514: https://github.com/CrayLabs/SmartSim/pull/514 -.. _SmartSim-PR512: https://github.com/CrayLabs/SmartSim/pull/512 -.. _SmartSim-PR507: https://github.com/CrayLabs/SmartSim/pull/507 -.. _SmartSim-PR504: https://github.com/CrayLabs/SmartSim/pull/504 -.. _SmartSim-PR501: https://github.com/CrayLabs/SmartSim/pull/501 -.. _SmartSim-PR463: https://github.com/CrayLabs/SmartSim/pull/463 - - -0.6.2 ------ - -Released on 16 February, 2024 - -Description - -- Patch SmartSim dependency version - - -Detailed Notes - -- A critical performance concern was identified and addressed in SmartRedis. A - patch fix was deployed, and SmartSim was updated to ensure users do not - inadvertently pull the unpatched version of SmartRedis. (SmartSim-PR493_) - - -.. _SmartSim-PR493: https://github.com/CrayLabs/SmartSim/pull/493 - - -0.6.1 ------ - -Released on 15 February, 2024 - -Description - -- Duplicate for DBModel/Script prevented -- Update license to include 2024 -- Telemetry monitor is now active by default -- Add support for Mac OSX on Apple Silicon -- Remove Torch warnings during testing -- Validate Slurm timing format -- Expose Python Typehints -- Fix test_logs to prevent generation of directory -- Fix Python Typehint for colocated database settings -- Python 3.11 Support -- Quality of life `smart validate` improvements -- Remove Cobalt support -- Enrich logging through context variables -- Upgrade Machine Learning dependencies -- Override sphinx-tabs background color -- Add concurrency group to test workflow -- Fix index when installing torch through smart build - - -Detailed Notes - -- Modify the `git clone` for both Redis and RedisAI to set the line endings to - unix-style line endings when using MacOS on ARM. 
(SmartSim-PR482_) -- Separate install instructions are now provided for Mac OSX on x64 vs ARM64 (SmartSim-PR479_) -- Prevent duplicate ML model and script names being added to an - Ensemble member if the names exists. (SmartSim-PR475_) -- Updates `Copyright (c) 2021-2023` to `Copyright (c) 2021-2024` - in all of the necessary files. (SmartSim-PR485_) -- Bug fix which prevents the expected behavior when the `SMARTSIM_LOG_LEVEL` - environment variable was set to `developer`. (SmartSim-PR473_) -- Sets the default value of the "enable telemetry" flag to on. - Bumps the output `manifest.json` version number to match that of - `smartdashboard` and pins a watchdog version to avoid build errors. - (SmartSim-PR477_) -- Refactor logic of `Manifest.has_db_objects` to remove excess branching - and improve readability/maintainability. (SmartSim-PR476_) -- SmartSim can now be built and used on platforms using Apple Silicon - (ARM64). Currently, only the PyTorch backend is supported. Note that libtorch - will be downloaded from a CrayLabs github repo. (SmartSim-PR465_) -- Tests that were saving Torch models were emitting warnings. These warnings - were addressed by updating the model save test function. (SmartSim-PR472_) -- Validate the timing format when requesting a slurm allocation. (SmartSim-PR471_) -- Add and ship `py.typed` marker to expose inline type hints. Fix - type errors related to SmartRedis. (SmartSim-PR468_) -- Fix the `test_logs.py::test_context_leak` test that was - erroneously creating a directory named `some value` in SmartSim's root - directory. (SmartSim-PR467_) -- Add Python type hinting to colocated settings. (SmartSim-PR462_) -- Add github actions for running black and isort checks. (SmartSim-PR464_) -- Relax the required version of `typing_extensions`. (SmartSim-PR459_) -- Addition of Python 3.11 to SmartSim. 
(SmartSim-PR461_) -- Quality of life `smart validate` improvements such as setting `CUDA_VISIBLE_DEVICES` - environment variable within `smart validate` prior to importing any ML deps to - prevent false negatives on multi-GPU systems. Additionally, move SmartRedis logs - from standard out to dedicated log file in the validation temporary directory as well as - suppress `sklearn` deprecation warning by pinning `KMeans` constructor - argument. Lastly, move TF test to last as TF may reserve the GPUs it uses. - (SmartSim-PR458_) -- Some actions in the current GitHub CI/CD workflows were outdated. They were - replaced with the latest versions. (SmartSim-PR446_) -- As the Cobalt workload manager is not used on any system we are aware of, - its support in SmartSim was terminated and classes such as `CobaltLauncher` have - been removed. (SmartSim-PR448_) -- Experiment logs are written to a file that can be read by the dashboard. (SmartSim-PR452_) -- Updated SmartSim's machine learning backends to PyTorch 2.0.1, Tensorflow - 2.13.1, ONNX 1.14.1, and ONNX Runtime 1.16.1. As a result of this change, - there is now an available ONNX wheel for use with Python 3.10, and wheels for - all of SmartSim's machine learning backends with Python 3.11. - (SmartSim-PR451_) (SmartSim-PR461_) -- The sphinx-tabs documentation extension uses a white background for the tabs component. - A custom CSS for those components to inherit the overall theme color has - been added. (SmartSim-PR453_) -- Add concurrency groups to GitHub's CI/CD workflows, preventing - multiple workflows from the same PR to be launched concurrently. - (SmartSim-PR439_) -- Torch changed their preferred indexing when trying to install - their provided wheels. Updated the `pip install` command within - `smart build` to ensure that the appropriate packages can be found. - (SmartSim-PR449_) - - -.. _SmartSim-PR485: https://github.com/CrayLabs/SmartSim/pull/485 -.. 
_SmartSim-PR482: https://github.com/CrayLabs/SmartSim/pull/482 -.. _SmartSim-PR479: https://github.com/CrayLabs/SmartSim/pull/479 -.. _SmartSim-PR477: https://github.com/CrayLabs/SmartSim/pull/477 -.. _SmartSim-PR476: https://github.com/CrayLabs/SmartSim/pull/476 -.. _SmartSim-PR475: https://github.com/CrayLabs/SmartSim/pull/475 -.. _SmartSim-PR473: https://github.com/CrayLabs/SmartSim/pull/473 -.. _SmartSim-PR472: https://github.com/CrayLabs/SmartSim/pull/472 -.. _SmartSim-PR471: https://github.com/CrayLabs/SmartSim/pull/471 -.. _SmartSim-PR468: https://github.com/CrayLabs/SmartSim/pull/468 -.. _SmartSim-PR467: https://github.com/CrayLabs/SmartSim/pull/467 -.. _SmartSim-PR465: https://github.com/CrayLabs/SmartSim/pull/465 -.. _SmartSim-PR464: https://github.com/CrayLabs/SmartSim/pull/464 -.. _SmartSim-PR462: https://github.com/CrayLabs/SmartSim/pull/462 -.. _SmartSim-PR461: https://github.com/CrayLabs/SmartSim/pull/461 -.. _SmartSim-PR459: https://github.com/CrayLabs/SmartSim/pull/459 -.. _SmartSim-PR458: https://github.com/CrayLabs/SmartSim/pull/458 -.. _SmartSim-PR453: https://github.com/CrayLabs/SmartSim/pull/453 -.. _SmartSim-PR452: https://github.com/CrayLabs/SmartSim/pull/452 -.. _SmartSim-PR451: https://github.com/CrayLabs/SmartSim/pull/451 -.. _SmartSim-PR449: https://github.com/CrayLabs/SmartSim/pull/449 -.. _SmartSim-PR448: https://github.com/CrayLabs/SmartSim/pull/448 -.. _SmartSim-PR446: https://github.com/CrayLabs/SmartSim/pull/446 -.. 
_SmartSim-PR439: https://github.com/CrayLabs/SmartSim/pull/439 - -0.6.0 ------ - -Released on 18 December, 2023 - -Description - -- Conflicting directives in the SmartSim packaging instructions were fixed -- `sacct` and `sstat` errors are now fatal for Slurm-based workflow executions -- Added documentation section about ML features and TorchScript -- Added TorchScript functions to Online Analysis tutorial -- Added multi-DB example to documentation -- Improved test stability on HPC systems -- Added support for producing & consuming telemetry outputs -- Split tests into groups for parallel execution in CI/CD pipeline -- Change signature of `Experiment.summary()` -- Expose first_device parameter for scripts, functions, models -- Added support for MINBATCHTIMEOUT in model execution -- Remove support for RedisAI 1.2.5, use RedisAI 1.2.7 commit -- Add support for multiple databases - -Detailed Notes - -- Several conflicting directives between the `setup.py` and the `setup.cfg` were fixed - to mitigate warnings issued when building the pip wheel. (SmartSim-PR435_) -- When the Slurm functions `sacct` and `sstat` returned an error, it would be ignored - and SmartSim's state could become inconsistent. To prevent this, errors - raised by `sacct` or `sstat` now result in an exception. (SmartSim-PR392_) -- A section named *ML Features* was added to documentation. It contains multiple - examples of how ML models and functions can be added to and executed on the DB. - TorchScript-based post-processing was added to the *Online Analysis* tutorial (SmartSim-PR411_) -- An example of how to use multiple Orchestrators concurrently was added to the documentation (SmartSim-PR409_) -- The test infrastructure was improved. 
Tests on HPC system are now stable, and issues such - as non-stopped `Orchestrators` or experiments created in the wrong paths have been fixed (SmartSim-PR381_) -- A telemetry monitor was added to check updates and produce events for SmartDashboard (SmartSim-PR426_) -- Split tests into `group_a`, `group_b`, `slow_tests` for parallel execution in CI/CD pipeline (SmartSim-PR417_, SmartSim-PR424_) -- Change `format` argument to `style` in `Experiment.summary()`, this is - an API break (SmartSim-PR391_) -- Added support for first_device parameter for scripts, functions, - and models. This causes them to be loaded to the first num_devices - beginning with first_device (SmartSim-PR394_) -- Added support for MINBATCHTIMEOUT in model execution, which caps the delay - waiting for a minimium number of model execution operations to accumulate - before executing them as a batch (SmartSim-PR387_) -- RedisAI 1.2.5 is not supported anymore. The only RedisAI version - is now 1.2.7. Since the officially released RedisAI 1.2.7 has a - bug which breaks the build process on Mac OSX, it was decided to - use commit 634916c_ from RedisAI's GitHub repository, where such - bug has been fixed. This applies to all operating systems. (SmartSim-PR383_) -- Add support for creation of multiple databases with unique identifiers. (SmartSim-PR342_) - - -.. _SmartSim-PR435: https://github.com/CrayLabs/SmartSim/pull/435 -.. _SmartSim-PR392: https://github.com/CrayLabs/SmartSim/pull/392 -.. _SmartSim-PR411: https://github.com/CrayLabs/SmartSim/pull/411 -.. _SmartSim-PR409: https://github.com/CrayLabs/SmartSim/pull/409 -.. _SmartSim-PR381: https://github.com/CrayLabs/SmartSim/pull/381 -.. _SmartSim-PR426: https://github.com/CrayLabs/SmartSim/pull/426 -.. _SmartSim-PR424: https://github.com/CrayLabs/SmartSim/pull/424 -.. _SmartSim-PR417: https://github.com/CrayLabs/SmartSim/pull/417 -.. _SmartSim-PR391: https://github.com/CrayLabs/SmartSim/pull/391 -.. 
_SmartSim-PR342: https://github.com/CrayLabs/SmartSim/pull/342 -.. _SmartSim-PR394: https://github.com/CrayLabs/SmartSim/pull/394 -.. _SmartSim-PR387: https://github.com/CrayLabs/SmartSim/pull/387 -.. _SmartSim-PR383: https://github.com/CrayLabs/SmartSim/pull/383 -.. _634916c: https://github.com/RedisAI/RedisAI/commit/634916c722e718cc6ea3fad46e63f7d798f9adc2 -.. _SmartSim-PR342: https://github.com/CrayLabs/SmartSim/pull/342 - - -0.5.1 ------ - -Released on 14 September, 2023 - -Description - -- Add typehints throughout the SmartSim codebase -- Provide support for Slurm heterogeneous jobs -- Provide better support for `PalsMpiexecSettings` -- Allow for easier inspection of SmartSim entities -- Log ignored error messages from `sacct` -- Fix colocated db preparation bug when using `JsrunSettings` -- Fix bug when user specify CPU and devices greater than 1 -- Fix bug when get_allocation called with reserved keywords -- Enabled mypy in CI for better type safety -- Mitigate additional suppressed pylint errors -- Update linting support and apply to existing errors -- Various improvements to the `smart` CLI -- Various documentation improvements -- Various test suite improvements - -Detailed Notes - -- Add methods to allow users to inspect files attached to models and ensembles. (SmartSim-PR352_) -- Add a `smart info` target to provide rudimentary information about the SmartSim installation. (SmartSim-PR350_) -- Remove unnecessary generation producing unexpected directories in the test suite. (SmartSim-PR349_) -- Add support for heterogeneous jobs to `SrunSettings` by allowing users to set the `--het-group` parameter. (SmartSim-PR346_) -- Provide clearer guidelines on how to contribute to SmartSim. (SmartSim-PR344_) -- Integrate `PalsMpiexecSettings` into the `Experiment` factory methods when using the `"pals"` launcher. (SmartSim-PR343_) -- Create public properties where appropriate to mitigate `protected-access` errors. 
(SmartSim-PR341_) -- Fix a failure to execute `_prep_colocated_db` due to incorrect named attr check. (SmartSim-PR339_) -- Enabled and mitigated mypy `disallow_any_generics` and `warn_return_any`. (SmartSim-PR338_) -- Add a `smart validate` target to provide a simple smoke test to assess a SmartSim build. (SmartSim-PR336_, SmartSim-PR351_) -- Add typehints to `smartsim._core.launcher.step.*`. (SmartSim-PR334_) -- Log errors reported from slurm WLM when attempts to retrieve status fail. (SmartSim-PR331_, SmartSim-PR332_) -- Fix incorrectly formatted positional arguments in log format strings. (SmartSim-PR330_) -- Ensure that launchers pass environment variables to unmanaged job steps. (SmartSim-PR329_) -- Add additional tests surrounding the `RAI_PATH` configuration environment variable. (SmartSim-PR328_) -- Remove unnecessary execution of unescaped shell commands. (SmartSim-PR327_) -- Add error if user calls get_allocation with reserved keywords in slurm get_allocation. (SmartSim-PR325_) -- Add error when user requests CPU with devices greater than 1 within add_ml_model and add_script. (SmartSim-PR324_) -- Update documentation surrounding ensemble key prefixing. (SmartSim-PR322_) -- Fix formatting of the Frontier site installation. (SmartSim-PR321_) -- Update pylint dependency, update .pylintrc, mitigate non-breaking issues, suppress api breaks. (SmartSim-PR311_) -- Refactor the `smart` CLI to use subparsers for better documentation and extension. (SmartSim-PR308_) - -.. _SmartSim-PR352: https://github.com/CrayLabs/SmartSim/pull/352 -.. _SmartSim-PR351: https://github.com/CrayLabs/SmartSim/pull/351 -.. _SmartSim-PR350: https://github.com/CrayLabs/SmartSim/pull/350 -.. _SmartSim-PR349: https://github.com/CrayLabs/SmartSim/pull/349 -.. _SmartSim-PR346: https://github.com/CrayLabs/SmartSim/pull/346 -.. _SmartSim-PR344: https://github.com/CrayLabs/SmartSim/pull/344 -.. _SmartSim-PR343: https://github.com/CrayLabs/SmartSim/pull/343 -.. 
_SmartSim-PR341: https://github.com/CrayLabs/SmartSim/pull/341 -.. _SmartSim-PR339: https://github.com/CrayLabs/SmartSim/pull/339 -.. _SmartSim-PR338: https://github.com/CrayLabs/SmartSim/pull/338 -.. _SmartSim-PR336: https://github.com/CrayLabs/SmartSim/pull/336 -.. _SmartSim-PR334: https://github.com/CrayLabs/SmartSim/pull/334 -.. _SmartSim-PR332: https://github.com/CrayLabs/SmartSim/pull/332 -.. _SmartSim-PR331: https://github.com/CrayLabs/SmartSim/pull/331 -.. _SmartSim-PR330: https://github.com/CrayLabs/SmartSim/pull/330 -.. _SmartSim-PR329: https://github.com/CrayLabs/SmartSim/pull/329 -.. _SmartSim-PR328: https://github.com/CrayLabs/SmartSim/pull/328 -.. _SmartSim-PR327: https://github.com/CrayLabs/SmartSim/pull/327 -.. _SmartSim-PR325: https://github.com/CrayLabs/SmartSim/pull/325 -.. _SmartSim-PR324: https://github.com/CrayLabs/SmartSim/pull/324 -.. _SmartSim-PR322: https://github.com/CrayLabs/SmartSim/pull/322 -.. _SmartSim-PR321: https://github.com/CrayLabs/SmartSim/pull/321 -.. _SmartSim-PR311: https://github.com/CrayLabs/SmartSim/pull/311 -.. 
_SmartSim-PR308: https://github.com/CrayLabs/SmartSim/pull/308 - - -0.5.0 ------ - -Released on 6 July, 2023 - -Description - -A full list of changes and detailed notes can be found below: - -- Update SmartRedis dependency to v0.4.1 -- Fix tests for db models and scripts -- Fix add_ml_model() and add_script() documentation, tests, and code -- Remove `requirements.txt` and other places where dependencies were defined -- Replace `limit_app_cpus` with `limit_db_cpus` for co-located orchestrators -- Remove wait time associated with Experiment launch summary -- Update and rename Redis conf file -- Migrate from redis-py-cluster to redis-py -- Update full test suite to not require a TF wheel at test time -- Update doc strings -- Remove deprecated code -- Relax the coloredlogs version -- Update Fortran tutorials for SmartRedis -- Add support for multiple network interface binding in Orchestrator and Colocated DBs -- Add typehints and static analysis - -Detailed notes - -- Updates SmartRedis to the most current release (SmartSim-PR316_) -- Fixes and enhancements to documentation (SmartSim-PR317_, SmartSim-PR314_, SmartSim-PR287_) -- Various fixes and enhancements to the test suite (SmartSim-PR315_, SmartSim-PR312_, SmartSim-PR310_, SmartSim-PR302_, SmartSim-PR283_) -- Fix a defect in the tests related to database models and scripts that was - causing key collisions when testing on workload managers (SmartSim-PR313_) -- Remove `requirements.txt` and other places where dependencies were defined. (SmartSim-PR307_) -- Fix defect where dictionaries used to create run settings can be changed - unexpectedly due to copy-by-ref (SmartSim-PR305_) -- The underlying code for Model.add_ml_model() and Model.add_script() was fixed - to correctly handle multi-GPU configurations. Tests were updated to run on - non-local launchers. Documentation was updated and fixed. Also, the default - testing interface has been changed to lo instead of ipogif. 
(SmartSim-PR304_) -- Typehints have been added. A makefile target `make check-mypy` executes static - analysis with mypy. (SmartSim-PR295_, SmartSim-PR301_, SmartSim-PR303_) -- Replace `limit_app_cpus` with `limit_db_cpus` for co-located orchestrators. - This resolves some incorrect behavior/assumptions about how the application - would be pinned. Instead, users should directly specify the binding options in - their application using the options appropriate for their launcher (SmartSim-PR306_) -- Simplify code in `random_permutations` parameter generation strategy (SmartSim-PR300_) -- Remove wait time associated with Experiment launch summary (SmartSim-PR298_) -- Update Redis conf file to conform with Redis v7.0.5 conf file (SmartSim-PR293_) -- Migrate from redis-py-cluster to redis-py for cluster status checks (SmartSim-PR292_) -- Update full test suite to no longer require a tensorflow wheel to be available at test time. (SmartSim-PR291_) -- Correct spelling of colocated in doc strings (SmartSim-PR290_) -- Deprecated launcher-specific orchestrators, constants, and ML - utilities were removed. (SmartSim-PR289_) -- Relax the coloredlogs version to be greater than 10.0 (SmartSim-PR288_) -- Update the Github Actions runner image from `macos-10.15`` to `macos-12``. The - former began deprecation in May 2022 and was finally removed in May 2023. (SmartSim-PR285_) -- The Fortran tutorials had not been fully updated to show how to handle - return/error codes. These have now all been updated. (SmartSim-PR284_) -- Orchestrator and Colocated DB now accept a list of interfaces to bind to. The - argument name is still `interface` for backward compatibility reasons. (SmartSim-PR281_) -- Typehints have been added to public APIs. A makefile target to execute static - analysis with mypy is available `make check-mypy`. (SmartSim-PR295_) - -.. _SmartSim-PR317: https://github.com/CrayLabs/SmartSim/pull/317 -.. _SmartSim-PR316: https://github.com/CrayLabs/SmartSim/pull/316 -.. 
_SmartSim-PR315: https://github.com/CrayLabs/SmartSim/pull/314 -.. _SmartSim-PR314: https://github.com/CrayLabs/SmartSim/pull/314 -.. _SmartSim-PR313: https://github.com/CrayLabs/SmartSim/pull/313 -.. _SmartSim-PR312: https://github.com/CrayLabs/SmartSim/pull/312 -.. _SmartSim-PR310: https://github.com/CrayLabs/SmartSim/pull/310 -.. _SmartSim-PR307: https://github.com/CrayLabs/SmartSim/pull/307 -.. _SmartSim-PR306: https://github.com/CrayLabs/SmartSim/pull/306 -.. _SmartSim-PR305: https://github.com/CrayLabs/SmartSim/pull/305 -.. _SmartSim-PR304: https://github.com/CrayLabs/SmartSim/pull/304 -.. _SmartSim-PR303: https://github.com/CrayLabs/SmartSim/pull/303 -.. _SmartSim-PR302: https://github.com/CrayLabs/SmartSim/pull/302 -.. _SmartSim-PR301: https://github.com/CrayLabs/SmartSim/pull/301 -.. _SmartSim-PR300: https://github.com/CrayLabs/SmartSim/pull/300 -.. _SmartSim-PR298: https://github.com/CrayLabs/SmartSim/pull/298 -.. _SmartSim-PR295: https://github.com/CrayLabs/SmartSim/pull/295 -.. _SmartSim-PR293: https://github.com/CrayLabs/SmartSim/pull/293 -.. _SmartSim-PR292: https://github.com/CrayLabs/SmartSim/pull/292 -.. _SmartSim-PR291: https://github.com/CrayLabs/SmartSim/pull/291 -.. _SmartSim-PR290: https://github.com/CrayLabs/SmartSim/pull/290 -.. _SmartSim-PR289: https://github.com/CrayLabs/SmartSim/pull/289 -.. _SmartSim-PR288: https://github.com/CrayLabs/SmartSim/pull/288 -.. _SmartSim-PR287: https://github.com/CrayLabs/SmartSim/pull/287 -.. _SmartSim-PR285: https://github.com/CrayLabs/SmartSim/pull/285 -.. _SmartSim-PR284: https://github.com/CrayLabs/SmartSim/pull/284 -.. _SmartSim-PR283: https://github.com/CrayLabs/SmartSim/pull/283 -.. _SmartSim-PR281: https://github.com/CrayLabs/SmartSim/pull/281 - -0.4.2 ------ - -Released on April 12, 2023 - -Description - -This release of SmartSim had a focus on polishing and extending exiting -features already provided by SmartSim. 
Most notably, this release provides -support to allow users to colocate their models with an orchestrator using -Unix domain sockets and support for launching models as batch jobs. - -Additionally, SmartSim has updated its tool chains to provide a better user -experience. Notably, SmarSim can now be used with Python 3.10, Redis 7.0.5, and -RedisAI 1.2.7. Furthermore, SmartSim now utilizes SmartRedis's aggregation lists to -streamline the use and extension of ML data loaders, making working with popular -machine learning frameworks in SmartSim a breeze. - -A full list of changes and detailed notes can be found below: - -- Add support for colocating an orchestrator over UDS -- Add support for Python 3.10, deprecate support for Python 3.7 and RedisAI 1.2.3 -- Drop support for Ray -- Update ML data loaders to make use of SmartRedis's aggregation lists -- Allow for models to be launched independently as batch jobs -- Update to current version of Redis to 7.0.5 -- Add support for RedisAI 1.2.7, pyTorch 1.11.0, Tensorflow 2.8.0, ONNXRuntime 1.11.1 -- Fix bug in colocated database entrypoint when loading PyTorch models -- Fix test suite behavior with environment variables - -Detailed Notes - -- Running some tests could result in some SmartSim-specific environment variables to be set. Such environment variables are now reset - after each test execution. Also, a warning for environment variable usage in Slurm was added, to make the user aware in case an environment - variable will not be assigned the desired value with `--export`. (SmartSim-PR270_) -- The PyTorch and TensorFlow data loaders were update to make use of aggregation lists. This breaks their API, but makes them easier to use. (SmartSim-PR264_) -- The support for Ray was dropped, as its most recent versions caused problems when deployed through SmartSim. - We plan to release a separate add-on library to accomplish the same results. 
If - you are interested in getting the Ray launch functionality back in your workflow, please get in touch with us! (SmartSim-PR263_) -- Update from Redis version 6.0.8 to 7.0.5. (SmartSim-PR258_) -- Adds support for Python 3.10 without the ONNX machine learning backend. Deprecates support for - Python 3.7 as it will stop receiving security updates. Deprecates support for RedisAI 1.2.3. - Update the build process to be able to correctly fetch supported dependencies. If a user - attempts to build an unsupported dependency, an error message is shown highlighting the - discrepancy. (SmartSim-PR256_) -- Models were given a `batch_settings` attribute. When launching a model through `Experiment.start` - the `Experiment` will first check for a non-nullish value at that attribute. If the check is - satisfied, the `Experiment` will attempt to wrap the underlying run command in a batch job using - the object referenced at `Model.batch_settings` as the batch settings for the job. If the check - is not satisfied, the `Model` is launched in the traditional manner as a job step. (SmartSim-PR245_) -- Fix bug in colocated database entrypoint stemming from uninitialized variables. This bug affects PyTorch models being loaded into the database. (SmartSim-PR237_) -- The release of RedisAI 1.2.7 allows us to update support for recent versions of PyTorch, Tensorflow, and ONNX (SmartSim-PR234_) -- Make installation of correct Torch backend more reliable according to instruction from PyTorch -- In addition to TCP, add UDS support for colocating an orchestrator with models. Methods - `Model.colocate_db_tcp` and `Model.colocate_db_uds` were added to expose this functionality. - The `Model.colocate_db` method remains and uses TCP for backward compatibility (SmartSim-PR246_) - -.. _SmartSim-PR270: https://github.com/CrayLabs/SmartSim/pull/270 -.. _SmartSim-PR264: https://github.com/CrayLabs/SmartSim/pull/264 -.. _SmartSim-PR263: https://github.com/CrayLabs/SmartSim/pull/263 -.. 
_SmartSim-PR258: https://github.com/CrayLabs/SmartSim/pull/258 -.. _SmartSim-PR256: https://github.com/CrayLabs/SmartSim/pull/256 -.. _SmartSim-PR246: https://github.com/CrayLabs/SmartSim/pull/246 -.. _SmartSim-PR245: https://github.com/CrayLabs/SmartSim/pull/245 -.. _SmartSim-PR237: https://github.com/CrayLabs/SmartSim/pull/237 -.. _SmartSim-PR234: https://github.com/CrayLabs/SmartSim/pull/234 - - -0.4.1 ------ - -Released on June 24, 2022 - -Description: -This release of SmartSim introduces a new experimental feature to help make -SmartSim workflows more portable: the ability to run simulations models in a -container via Singularity. This feature has been tested on a small number of -platforms and we encourage users to provide feedback on its use. - -We have also made improvements in a variety of areas: new utilities to load -scripts and machine learning models into the database directly from SmartSim -driver scripts and install-time choice to use either `KeyDB` or `Redis` for the -Orchestrator. The `RunSettings` API is now more consistent across subclasses. Another -key focus of this release was to aid new SmartSim users by including more -extensive tutorials and improving the documentation. The docker image containing -the SmartSim tutorials now also includes a tutorial on online training. 
- - -Launcher improvements - - - New methods for specifying `RunSettings` parameters (SmartSim-PR166_) (SmartSim-PR170_) - - Better support for `mpirun`, `mpiexec`, and `orterun` as launchers (SmartSim-PR186_) - - Experimental: add support for running models via Singularity (SmartSim-PR204_) - -Documentation and tutorials - - - Tutorial updates (SmartSim-PR155_) (SmartSim-PR203_) (SmartSim-PR208_) - - Add SmartSim Zoo info to documentation (SmartSim-PR175_) - - New tutorial for demonstrating online training (SmartSim-PR176_) (SmartSim-PR188_) - -General improvements and bug fixes - - - Set models and scripts at the driver level (SmartSim-PR185_) - - Optionally use KeyDB for the orchestrator (SmartSim-PR180_) - - Ability to specify system-level libraries (SmartSim-PR154_) (SmartSim-PR182_) - - Fix the handling of LSF gpus_per_shard (SmartSim-PR164_) - - Fix error when re-running `smart build` (SmartSim-PR165_) - - Fix generator hanging when tagged configuration variables are missing (SmartSim-PR177_) - -Dependency updates - - - CMake version from 3.10 to 3.13 (SmartSim-PR152_) - - Update click to 8.0.2 (SmartSim-PR200_) - -.. _SmartSim-PR152: https://github.com/CrayLabs/SmartSim/pull/152 -.. _SmartSim-PR154: https://github.com/CrayLabs/SmartSim/pull/154 -.. _SmartSim-PR155: https://github.com/CrayLabs/SmartSim/pull/155 -.. _SmartSim-PR164: https://github.com/CrayLabs/SmartSim/pull/164 -.. _SmartSim-PR165: https://github.com/CrayLabs/SmartSim/pull/165 -.. _SmartSim-PR166: https://github.com/CrayLabs/SmartSim/pull/166 -.. _SmartSim-PR170: https://github.com/CrayLabs/SmartSim/pull/170 -.. _SmartSim-PR175: https://github.com/CrayLabs/SmartSim/pull/175 -.. _SmartSim-PR176: https://github.com/CrayLabs/SmartSim/pull/176 -.. _SmartSim-PR177: https://github.com/CrayLabs/SmartSim/pull/177 -.. _SmartSim-PR180: https://github.com/CrayLabs/SmartSim/pull/180 -.. _SmartSim-PR182: https://github.com/CrayLabs/SmartSim/pull/182 -.. 
_SmartSim-PR185: https://github.com/CrayLabs/SmartSim/pull/185 -.. _SmartSim-PR186: https://github.com/CrayLabs/SmartSim/pull/186 -.. _SmartSim-PR188: https://github.com/CrayLabs/SmartSim/pull/188 -.. _SmartSim-PR200: https://github.com/CrayLabs/SmartSim/pull/200 -.. _SmartSim-PR203: https://github.com/CrayLabs/SmartSim/pull/203 -.. _SmartSim-PR204: https://github.com/CrayLabs/SmartSim/pull/204 -.. _SmartSim-PR208: https://github.com/CrayLabs/SmartSim/pull/208 - -0.4.0 ------ - -Released on Feb 11, 2022 - -Description: -In this release SmartSim continues to promote ease of use. -To this end SmartSim has introduced new portability features -that allow users to abstract away their targeted hardware, -while providing even more compatibility with existing -libraries. - -A new feature, Co-located orchestrator deployments has -been added which provides scalable online inference -capabilities that overcome previous performance limitations -in seperated orchestrator/application deployments. -For more information on advantages of co-located deployments, -see the Orchestrator section of the SmartSim documentation. - -The SmartSim build was significantly improved to increase -customization of build toolchain and the ``smart`` command -line inferface was expanded. - -Additional tweaks and upgrades have also been -made to ensure an optimal experience. Here is a -comprehensive list of changes made in SmartSim 0.4.0. 
- - -Orchestrator Enhancements: - - - Add Orchestrator Co-location (SmartSim-PR139_) - - Add Orchestrator configuration file edit methods (SmartSim-PR109_) - -Emphasize Driver Script Portability: - - - Add ability to create run settings through an experiment (SmartSim-PR110_) - - Add ability to create batch settings through an experiment (SmartSim-PR112_) - - Add automatic launcher detection to experiment portability functions (SmartSim-PR120_) - -Expand Machine Learning Library Support: - - - Data loaders for online training in Keras/TF and Pytorch (SmartSim-PR115_) (SmartSim-PR140_) - - ML backend versions updated with expanded support for multiple versions (SmartSim-PR122_) - - Launch Ray internally using ``RunSettings`` (SmartSim-PR118_) - - Add Ray cluster setup and deployment to SmartSim (SmartSim-PR50_) - -Expand Launcher Setting Options: - - - Add ability to use base ``RunSettings`` on a Slurm, or PBS launchers (SmartSim-PR90_) - - Add ability to use base ``RunSettings`` on LFS launcher (SmartSim-PR108_) - -Deprecations and Breaking Changes - - - Orchestrator classes combined into single implementation for portability (SmartSim-PR139_) - - ``smartsim.constants`` changed to ``smartsim.status`` (SmartSim-PR122_) - - ``smartsim.tf`` migrated to ``smartsim.ml.tf`` (SmartSim-PR115_) (SmartSim-PR140_) - - TOML configuration option removed in favor of environment variable approach (SmartSim-PR122_) - -General Improvements and Bug Fixes: - - - Improve and extend parameter handling (SmartSim-PR107_) (SmartSim-PR119_) - - Abstract away non-user facing implementation details (SmartSim-PR122_) - - Add various dimensions to the CI build matrix for SmartSim testing (SmartSim-PR130_) - - Add missing functions to LSFSettings API (SmartSim-PR113_) - - Add RedisAI checker for installed backends (SmartSim-PR137_) - - Remove heavy and unnecessary dependencies (SmartSim-PR116_) (SmartSim-PR132_) - - Fix LSFLauncher and LSFOrchestrator (SmartSim-PR86_) - - Fix over greedy 
Workload Manager Parsers (SmartSim-PR95_) - - Fix Slurm handling of comma-separated env vars (SmartSim-PR104_) - - Fix internal method calls (SmartSim-PR138_) - -Documentation Updates: - - - Updates to documentation build process (SmartSim-PR133_) (SmartSim-PR143_) - - Updates to documentation content (SmartSim-PR96_) (SmartSim-PR129_) (SmartSim-PR136_) (SmartSim-PR141_) - - Update SmartSim Examples (SmartSim-PR68_) (SmartSim-PR100_) - - -.. _SmartSim-PR50: https://github.com/CrayLabs/SmartSim/pull/50 -.. _SmartSim-PR68: https://github.com/CrayLabs/SmartSim/pull/68 -.. _SmartSim-PR86: https://github.com/CrayLabs/SmartSim/pull/86 -.. _SmartSim-PR90: https://github.com/CrayLabs/SmartSim/pull/90 -.. _SmartSim-PR95: https://github.com/CrayLabs/SmartSim/pull/95 -.. _SmartSim-PR96: https://github.com/CrayLabs/SmartSim/pull/96 -.. _SmartSim-PR100: https://github.com/CrayLabs/SmartSim/pull/100 -.. _SmartSim-PR104: https://github.com/CrayLabs/SmartSim/pull/104 -.. _SmartSim-PR107: https://github.com/CrayLabs/SmartSim/pull/107 -.. _SmartSim-PR108: https://github.com/CrayLabs/SmartSim/pull/108 -.. _SmartSim-PR109: https://github.com/CrayLabs/SmartSim/pull/109 -.. _SmartSim-PR110: https://github.com/CrayLabs/SmartSim/pull/110 -.. _SmartSim-PR112: https://github.com/CrayLabs/SmartSim/pull/112 -.. _SmartSim-PR113: https://github.com/CrayLabs/SmartSim/pull/113 -.. _SmartSim-PR115: https://github.com/CrayLabs/SmartSim/pull/115 -.. _SmartSim-PR116: https://github.com/CrayLabs/SmartSim/pull/116 -.. _SmartSim-PR118: https://github.com/CrayLabs/SmartSim/pull/118 -.. _SmartSim-PR119: https://github.com/CrayLabs/SmartSim/pull/119 -.. _SmartSim-PR120: https://github.com/CrayLabs/SmartSim/pull/120 -.. _SmartSim-PR122: https://github.com/CrayLabs/SmartSim/pull/122 -.. _SmartSim-PR129: https://github.com/CrayLabs/SmartSim/pull/129 -.. _SmartSim-PR130: https://github.com/CrayLabs/SmartSim/pull/130 -.. _SmartSim-PR132: https://github.com/CrayLabs/SmartSim/pull/132 -.. 
_SmartSim-PR133: https://github.com/CrayLabs/SmartSim/pull/133 -.. _SmartSim-PR136: https://github.com/CrayLabs/SmartSim/pull/136 -.. _SmartSim-PR137: https://github.com/CrayLabs/SmartSim/pull/137 -.. _SmartSim-PR138: https://github.com/CrayLabs/SmartSim/pull/138 -.. _SmartSim-PR139: https://github.com/CrayLabs/SmartSim/pull/139 -.. _SmartSim-PR140: https://github.com/CrayLabs/SmartSim/pull/140 -.. _SmartSim-PR141: https://github.com/CrayLabs/SmartSim/pull/141 -.. _SmartSim-PR143: https://github.com/CrayLabs/SmartSim/pull/143 - - -0.3.2 ------ - -Released on August 10, 2021 - -Description: - - - Upgraded RedisAI backend to 1.2.3 (SmartSim-PR69_) - - PyTorch 1.7.1, TF 2.4.2, and ONNX 1.6-7 (SmartSim-PR69_) - - LSF launcher for IBM machines (SmartSim-PR62_) - - Improved code coverage by adding more unit tests (SmartSim-PR53_) - - Orchestrator methods to get address and check status (SmartSim-PR60_) - - Added Manifest object that tracks deployables in Experiments (SmartSim-PR61_) - - Bug fixes (SmartSim-PR52_) (SmartSim-PR58_) (SmartSim-PR67_) (SmartSim-PR73_) - - Updated documentation and examples (SmartSim-PR51_) (SmartSim-PR57_) (SmartSim-PR71_) - - Improved IP address aquisition (SmartSim-PR72_) - - Binding database to network interfaces - -.. _SmartSim-PR51: https://github.com/CrayLabs/SmartSim/pull/51 -.. _SmartSim-PR52: https://github.com/CrayLabs/SmartSim/pull/52 -.. _SmartSim-PR53: https://github.com/CrayLabs/SmartSim/pull/53 -.. _SmartSim-PR57: https://github.com/CrayLabs/SmartSim/pull/57 -.. _SmartSim-PR58: https://github.com/CrayLabs/SmartSim/pull/58 -.. _SmartSim-PR60: https://github.com/CrayLabs/SmartSim/pull/60 -.. _SmartSim-PR61: https://github.com/CrayLabs/SmartSim/pull/61 -.. _SmartSim-PR62: https://github.com/CrayLabs/SmartSim/pull/62 -.. _SmartSim-PR67: https://github.com/CrayLabs/SmartSim/pull/67 -.. _SmartSim-PR69: https://github.com/CrayLabs/SmartSim/pull/69 -.. _SmartSim-PR71: https://github.com/CrayLabs/SmartSim/pull/71 -.. 
_SmartSim-PR72: https://github.com/CrayLabs/SmartSim/pull/72 -.. _SmartSim-PR73: https://github.com/CrayLabs/SmartSim/pull/73 - -0.3.1 ------ - -Released on May 5, 2021 - -Description: -This release was dedicated to making the install process -easier. SmartSim can be installed from PyPI now and the -``smart`` cli tool makes installing the machine learning -runtimes much easier. - - - Pip install (SmartSim-PR42_) - - ``smart`` cli tool for ML backends (SmartSim-PR42_) - - Build Documentation for updated install (SmartSim-PR43_) - - Migrate from Jenkins to Github Actions CI (SmartSim-PR42_) - - Bug fix for setup.cfg (SmartSim-PR35_) - -.. _SmartSim-PR43: https://github.com/CrayLabs/SmartSim/pull/43 -.. _SmartSim-PR42: https://github.com/CrayLabs/SmartSim/pull/42 -.. _SmartSim-PR35: https://github.com/CrayLabs/SmartSim/pull/35 - -0.3.0 ------ - -Released on April 1, 2021 - -Description: - - - initial 0.3.0 (first public) release of SmartSim - - ---------------------------------------------------------------- - -.. _sr_changelog: - -SmartRedis -========== - -.. include:: ../smartredis/doc/changelog.rst - :start-line: 3 diff --git a/doc/dragon.rst b/doc/dragon.rst new file mode 100644 index 000000000..abc761014 --- /dev/null +++ b/doc/dragon.rst @@ -0,0 +1,154 @@ +****** +Dragon +****** + +============ +Introduction +============ + +`Dragon `_ is a +composable distributed run-time targeting HPC workflows. In SmartSim, +Dragon can be used as a launcher, within a Slurm or PBS allocation or batch job. + +.. note:: + The Dragon launcher is at an early development stage and should be considered + a prototype implementation. Please report any issue you encounter while using + it and provide feedback about missing features you would like to see + implemented. + +===== +Usage +===== +To be able to use Dragon, you will have to install it in your current Python +environment. This can be done as part of the ``smart build`` step, as explained +in `_dragon_install`. 
+ +Once installed, Dragon can be selected as launcher when creating an ``Experiment``: + +.. code-block:: python + + exp = Experiment(name="dragon-example", launcher="dragon") + + +Dragon has its own run settings class, ``DragonRunSettings``, +which can be used to specify nodes and tasks per node for a ``Model``, +for example, continuing from the previous example: + +.. code-block:: python + + rs = exp.create_run_settings(exe="mpi_app", + exe_args=["--option", "value"], + env_vars={"MYVAR": "VALUE"}) + rs.set_nodes(4) + rs.set_tasks_per_node(3) + mpi_app = exp.create_model("MPI_APP", run_settings=rs) + exp.start(mpi_app) + + +All types of SmartSim entities are supported, including ``Ensemble`` +and ``Orchestrator``, and the underlying Dragon launcher is completely +transparent to the user. In the next sections, we will explain +how Dragon is integrated into SmartSim. + +================= +The Dragon Server +================= + +Dragon can start processes on any resource available within an allocation. +To do this, the so-called Dragon infrastructure needs to be started. SmartSim +instantiates the Dragon infrastructure whenever a ``Model`` needs to be started +and will keep it up and running until the parent ``Experiment`` is active. +To be able to interact with processes started through Dragon, +SmartSim spins up a command server in the Dragon infrastructure and sends commands +to it every time a process needs to be started or stopped, and to query its status. +We call this server the `Dragon Server`, and its lifecycle is managed by SmartSim. + + +Sharing the Dragon Server across Experiments +============================================ + +Currently, SmartSim only supports one Dragon server per allocation. For this reason, +if multiple ``Experiment``s need to run in the same allocation, the Dragon server needs +to be shared among them. By default, the server is started from a subdirectory of the +``Experiment`` path. 
To make it possible to share the server, it is possible to +specify a path from which the Server should be started through the environment variable +``SMARTSIM_DRAGON_SERVER_PATH``: every ``Experiment`` will look for the running +server in the given path and only start a new server instance if there is none running. + +Dragon's High-Speed Transport Agents +==================================== + +On systems where the HPE Slingshot interconnect is available, Dragon can use +High-Speed Transport Agents (HSTA) to send internal messages. This is the default +choice for messages sent in the Dragon infrastructure started by SmartSim. On +systems where the HPE Slingshot interconnect is not available, TCP agents must be +used. To specify TCP agents, the environment variable ``SMARTSIM_DRAGON_TRANSPORT`` +must be set to ``tcp`` prior to the ``Experiment`` execution. + +============ +Communcation +============ + +SmartSim and the Dragon Server communicate using `ZeroMQ `_. + +As with any communication protocol, some timeouts for send and receive must be defined. +SmartSim sets some default timeouts that have been tested to work on most available systems, +but if you see failed communication attempts, you may want to try to adjust the +timeouts by setting the corresponding environment variable. +The timeouts are given in milliseconds and they are defined as follows: + +- server start-up timeout: the time waited by the SmartSim ``Experiment`` when the server + is first started. This timeout must account for the time it takes Dragon to set up the + infrastructure, which depends on the system's workload manager response time. + Defaults to ``"300000"`` (i.e. five minutes) and can be overridden with the environment variable + ``SMARTSIM_DRAGON_STARTUP_TIMEOUT``. + +- server send and receive timeout: the time waited by SmartSim and the Dragon server to send or + receive a message. Defaults to ``"30000"`` (i.e.
30 seconds) and can be overridden with the + environment variable ``SMARTSIM_DRAGON_TIMEOUT``. + +Setting any timeout to ``"-1"`` will result in infinite waiting time, which means that the +execution will block until the communication is completed, and hang indefinitely if something went wrong. + + +All communications are secured with elliptic curve cryptography, +and the key-pairs needed by the protocol are created by SmartSim and stored in the +user's home directory, unless another path is specified through the environment variable +``SMARTSIM_KEY_PATH``. + + +.. _dragon_known_issues: + +============ +Known issues +============ + +As previously remarked, the SmartSim-Dragon integration is at an early development stage +and there are some known issues that can lead to errors during runs. + +- *Incomplete cleanup of Dragon resources*: when SmartSim exits, it ensures that the dragon + infrastructure is correctly shut down, so that all the associated resources (such as + shared memory segments) are cleaned up and all processes are terminated. Nevertheless, + in some rare cases, when the execution is interrupted abruptly (for example by terminating + SmartSim with ``SIGKILL``), the cleanup process can be incomplete and processes + such as the Dragon overlay network will remain active on the node where SmartSim was + executed (which could be a login node, especially on Slurm systems). If that happens + you can run + + .. code-block:: + + smart teardown --dragon + + which will kill all Dragon related processes, return shared memory segments, but also + kill all Python processes (associated to your user name). + +- *Dragon server not starting*: this can happen because of two main reasons + + 1. HSTA not available on the system: try setting the environment variable + ``SMARTSIM_DRAGON_TRANSPORT`` to ``tcp`` + 2. System or Workload Manager too busy: try setting the environment variable + ``SMARTSIM_DRAGON_STARTUP_TIMEOUT`` to a larger value or to ``"-1"``.
+ + +- *MPI-based applications hanging*: to run MPI-based applications on Dragon, PMI must be + available on the system. This is a current limitation and is actively been worked on. \ No newline at end of file diff --git a/doc/experiment.rst b/doc/experiment.rst index 73ba08812..91b14cd6c 100644 --- a/doc/experiment.rst +++ b/doc/experiment.rst @@ -52,14 +52,15 @@ SmartSim supports launching AI-enabled workflows on a wide variety of systems, i Linux machine or on HPC machines with a job scheduler (e.g. Slurm, PBS Pro, and LSF). When creating a SmartSim ``Experiment``, the user has the opportunity to specify the `launcher` type or defer to automatic `launcher` selection. `Launcher` selection determines how SmartSim translates entity configurations into system calls to launch, -manage, and monitor. Currently, SmartSim supports 5 `launchers`: +manage, and monitor. Currently, SmartSim supports 7 `launcher` options: 1. ``local`` **[default]**: for single-node, workstation, or laptop 2. ``slurm``: for systems using the Slurm scheduler 3. ``pbs``: for systems using the PBS Pro scheduler 4. ``pals``: for systems using the PALS scheduler 5. ``lsf``: for systems using the LSF scheduler -6. ``auto``: have SmartSim auto-detect the launcher to use +6. ``auto``: have SmartSim auto-detect the launcher to use from the above +7. ``dragon``: if Dragon was installed in the current Python environment If the systems `launcher` cannot be found or no `launcher` argument is provided, the default value of `"local"` will be assigned which will start all ``Experiment`` launched entities on the diff --git a/doc/index.rst b/doc/index.rst index 7e7d9c2d6..4c64712b2 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -37,6 +37,7 @@ orchestrator ss_logger ml_features + dragon api/smartsim_api .. 
toctree:: diff --git a/doc/installation_instructions/basic.rst b/doc/installation_instructions/basic.rst index 905519f6f..b7428a60c 100644 --- a/doc/installation_instructions/basic.rst +++ b/doc/installation_instructions/basic.rst @@ -237,6 +237,28 @@ to building SmartSim with GPU support is to specify a different ``device`` backends look for the CUDA Toolkit and cuDNN libraries. Please see the :ref:`Platform Installation Section ` section for guidance. + +.. _dragon_install: + +Dragon Install +-------------- + +`Dragon `_ is +an HPC-native library for distributed computing. SmartSim can use Dragon as a +launcher on systems with Slurm or PBS as schedulers. To install the correct +version of Dragon, you can add the ``--dragon`` option to ``smart build``. +For example, to install dragon alongside the RedisAI CPU backends, you can run + +.. code-block:: bash + + # run one of the following + smart build --device cpu --dragon # install Dragon, PT and TF for cpu + smart build --device cpu --onnx --dragon # install Dragon and all backends (PT, TF, ONNX) on cpu + +.. note:: + Dragon is only supported on Linux systems. For further information, you + can read the dedicated documentation section. + ========== SmartRedis ========== @@ -300,7 +322,7 @@ source remains at the site of the clone instead of in site-packages. pip install -e .[dev,ml] # for bash users pip install -e .\[dev,ml\] # for zsh users -Use the now installed ``smart`` cli to install the machine learning runtimes. +Use the now installed ``smart`` cli to install the machine learning runtimes and dragon. .. tabs:: @@ -309,8 +331,8 @@ Use the now installed ``smart`` cli to install the machine learning runtimes. .. 
code-block:: bash # run one of the following - smart build --device cpu --onnx # install with cpu-only support - smart build --device gpu --onnx # install with both cpu and gpu support + smart build --device cpu --onnx --dragon # install with cpu-only support + smart build --device gpu --onnx --dragon # install with both cpu and gpu support .. tab:: MacOS (Intel x64) diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 33f4270ee..2b95ff921 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -166,7 +166,7 @@ def dragon_server_timeout(self) -> int: @property def dragon_server_startup_timeout(self) -> int: - return int(os.getenv("SMARTSIM_DRAGON_STARTUP_TIMEOUT", "-1")) + return int(os.getenv("SMARTSIM_DRAGON_STARTUP_TIMEOUT", "300000")) @property def dragon_transport(self) -> str: diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 065a36d44..87c15893e 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -222,6 +222,9 @@ def connect_to_dragon(self) -> None: if self._dragon_server_path is None: raise SmartSimError("Path to Dragon server not set.") + logger.info("Establishing connection with Dragon server" + "or starting a new one...") + path = _resolve_dragon_path(self._dragon_server_path) self._connect_to_existing_server(path) diff --git a/smartsim/settings/dragonRunSettings.py b/smartsim/settings/dragonRunSettings.py index d888d867c..b8baa4708 100644 --- a/smartsim/settings/dragonRunSettings.py +++ b/smartsim/settings/dragonRunSettings.py @@ -39,22 +39,19 @@ def __init__( self, exe: str, exe_args: t.Optional[t.Union[str, t.List[str]]] = None, - run_args: t.Optional[t.Dict[str, t.Union[int, str, float, None]]] = None, env_vars: t.Optional[t.Dict[str, t.Optional[str]]] = None, - alloc: t.Optional[str] = None, **kwargs: t.Any, ) -> None: """Initialize run parameters 
for a Dragon process ``DragonRunSettings`` should only be used on systems where Dragon - is available. + is available and installed in the current environment. If an allocation is specified, the instance receiving these run parameters will launch on that allocation. :param exe: executable to run :param exe_args: executable arguments, defaults to None - :param run_args: srun arguments without dashes, defaults to None :param env_vars: environment variables for job, defaults to None :param alloc: allocation ID if running on existing alloc, defaults to None """ @@ -62,12 +59,9 @@ def __init__( exe, exe_args, run_command="", - run_args=run_args, env_vars=env_vars, **kwargs, ) - self.alloc = alloc - self.mpmd: t.List[RunSettings] = [] def set_nodes(self, nodes: int) -> None: """Set the number of nodes @@ -76,86 +70,9 @@ def set_nodes(self, nodes: int) -> None: """ self.run_args["nodes"] = nodes - def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: - """Specify the hostlist for this job - - :param host_list: hosts to launch on - :raises TypeError: if not str or list of str - """ - if isinstance(host_list, str): - host_list = [host_list.strip()] - if not isinstance(host_list, list): - raise TypeError("host_list argument must be a list of strings") - if not all(isinstance(host, str) for host in host_list): - raise TypeError("host_list argument must be list of strings") - self.run_args["nodelist"] = ",".join(host_list) - - def set_hostlist_from_file(self, file_path: str) -> None: - """Use the contents of a file to set the node list - - :param file_path: Path to the hostlist file - """ - self.run_args["nodefile"] = file_path - - def set_excluded_hosts(self, host_list: t.Union[str, t.List[str]]) -> None: - """Specify a list of hosts to exclude for launching this job - - :param host_list: hosts to exclude - :raises TypeError: - """ - if isinstance(host_list, str): - host_list = [host_list.strip()] - if not isinstance(host_list, list): - raise 
TypeError("host_list argument must be a list of strings") - if not all(isinstance(host, str) for host in host_list): - raise TypeError("host_list argument must be list of strings") - self.run_args["exclude"] = ",".join(host_list) - - def set_cpus_per_task(self, cpus_per_task: int) -> None: - """Set the number of cpus to use per task - - :param num_cpus: number of cpus to use per task - """ - self.run_args["cpus-per-task"] = cpus_per_task - - def set_tasks(self, tasks: int) -> None: - """Set the number of tasks for this job - - :param tasks: number of tasks - """ - self.run_args["ntasks"] = tasks - def set_tasks_per_node(self, tasks_per_node: int) -> None: """Set the number of tasks for this job :param tasks_per_node: number of tasks per node """ self.run_args["tasks-per-node"] = tasks_per_node - - def set_memory_per_node(self, memory_per_node: int) -> None: - """Specify the real memory required per node - - :param memory_per_node: Amount of memory per node in megabytes - """ - self.run_args["mem"] = f"{memory_per_node}M" - - def set_verbose_launch(self, verbose: bool) -> None: - """Set the job to run in verbose mode - - This sets ``--verbose`` - - :param verbose: Whether the job should be run verbosely - """ - if verbose: - self.run_args["verbose"] = None - else: - self.run_args.pop("verbose", None) - - def set_walltime(self, walltime: str) -> None: - """Set the walltime of the job - - format = "HH:MM:SS" - - :param walltime: wall time - """ - self.run_args["time"] = str(walltime) From e189b0720920c37a72366b4b3cbf773fab6bb91e Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 9 May 2024 17:47:44 +0200 Subject: [PATCH 087/101] Add tech detail to docs --- doc/dragon.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/dragon.rst b/doc/dragon.rst index abc761014..1c26b0495 100644 --- a/doc/dragon.rst +++ b/doc/dragon.rst @@ -150,5 +150,5 @@ and there are some known issues that can lead to errors during runs. 
``SMARTSIM_DRAGON_STARTUP_TIMEOUT`` to a larger value or to ``"-1"``. -- *MPI-based applications hanging*: to run MPI-based applications on Dragon, PMI must be - available on the system. This is a current limitation and is actively been worked on. \ No newline at end of file +- *MPI-based applications hanging*: to run MPI-based applications on Dragon, Cray PMI or Cray PALS + must be available on the system. This is a current limitation and is actively being worked on. \ No newline at end of file From f7e985041a5039e0945bc84e7c6972a6f08a6afe Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 9 May 2024 10:53:32 -0500 Subject: [PATCH 088/101] Add dragon test to group A --- tests/test_dragon_backend.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_dragon_backend.py b/tests/test_dragon_backend.py index bf06d93eb..be4babf0d 100644 --- a/tests/test_dragon_backend.py +++ b/tests/test_dragon_backend.py @@ -32,6 +32,9 @@ import pytest +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + try: import dragon except ImportError: From 57d81935cd6fdfecbb33313c672facdac862b00c Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 9 May 2024 18:48:40 +0200 Subject: [PATCH 089/101] Fix for new signature --- smartsim/settings/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/settings/settings.py b/smartsim/settings/settings.py index b4de2dc73..5f7fc3fe2 100644 --- a/smartsim/settings/settings.py +++ b/smartsim/settings/settings.py @@ -185,7 +185,7 @@ def _detect_command(launcher: str) -> str: if launcher == "dragon": return DragonRunSettings( - exe, exe_args, run_args, env_vars, container=container, **kwargs + exe=exe, exe_args=exe_args, env_vars=env_vars, container=container, **kwargs ) # if user specified and supported or auto detection worked From 2cdc7a4010c073c93c8cdf46e18c8e706f79af6f Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 9 May 2024 12:07:59 -0500 Subject: [PATCH 090/101]
Weird black issue --- smartsim/_core/launcher/dragon/dragonConnector.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 87c15893e..f3d9c72d4 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -222,8 +222,9 @@ def connect_to_dragon(self) -> None: if self._dragon_server_path is None: raise SmartSimError("Path to Dragon server not set.") - logger.info("Establishing connection with Dragon server" - "or starting a new one...") + logger.info( + "Establishing connection with Dragon server or starting a new one..." + ) path = _resolve_dragon_path(self._dragon_server_path) From e929e614ad91a9bd62fad73f9fbddaab0b7f312a Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 9 May 2024 19:19:22 +0200 Subject: [PATCH 091/101] Doc touchups --- doc/dragon.rst | 22 +++++++++++----------- doc/experiment.rst | 2 +- doc/installation_instructions/basic.rst | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/doc/dragon.rst b/doc/dragon.rst index 1c26b0495..f75280242 100644 --- a/doc/dragon.rst +++ b/doc/dragon.rst @@ -21,7 +21,7 @@ Usage ===== To be able to use Dragon, you will have to install it in your current Python environment. This can be done as part of the ``smart build`` step, as explained -in `_dragon_install`. +in :ref:`Dragon Install `. Once installed, Dragon can be selected as launcher when creating an ``Experiment``: @@ -68,11 +68,12 @@ Sharing the Dragon Server across Experiments ============================================ Currently, SmartSim only supports one Dragon server per allocation. For this reason, -if multiple ``Experiment``s need to run in the same allocation, the Dragon server needs +if multiple ``Experiments`` need to run in the same allocation, the Dragon server needs to be shared among them. 
By default, the server is started from a subdirectory of the ``Experiment`` path. To make it possible to share the server, it is possible to -specify a path from which the Server should be started through the environment variable -``SMARTSIM_DRAGON_SERVER_PATH``: every ``Experiment`` will look for the running +specify a path from which the Server should be started. This can be done +by setting the environment variable +``SMARTSIM_DRAGON_SERVER_PATH`` to an existing path: every ``Experiment`` will look for the running server in the given path and only start a new server instance if there is none running. Dragon's High-Speed Transport Agents @@ -91,19 +92,19 @@ Communcation SmartSim and the Dragon Server communicate using `ZeroMQ `_. -As with any communication protocol, some timeouts for send and receive must be defined. +As with any communication protocol, some timeouts for send and receive operations must be defined. SmartSim sets some default timeouts that have been tested to work on most available systems, but if you see failed communication attempts, you may want to try to adjust the timeouts by setting the corresponding environment variable. The timeouts are given in milliseconds and they are defined as follows: -- server start-up timeout: the time waited by the SmartSim ``Experiment`` when the server +- *server start-up timeout*: the time waited by the SmartSim ``Experiment`` when the server is first started. This timeout must account for the time it takes Dragon to set up the infrastructure, which depends on the system's workload manager response time. Defaults to ``"300000"`` (i.e. five minutes) and can be overridden with the environment variable ``SMARTSIM_DRAGON_STARTUP_TIMEOUT``. -- server send and receive timeout: the time waited by SmartSim and the Dragon server to send or +- *server send and receive timeout*: the time waited by SmartSim and the Dragon server to send or receive a message. Defaults to ``"30000"`` (i.e. 
30 seconds) and can be overridden with the environment variable ``SMARTSIM_DRAGON_TIMEOUT``. @@ -117,14 +118,14 @@ user's home directory, unless another path is specified through the environment ``SMARTSIM_KEY_PATH``. -..dragon_known_issues_: +.. _dragon_known_issues: ============ Known issues ============ As previosuly remarked, the SmartSim-Dragon integration is at an early development stage -and there are some known issues that can lead to errors during runs. +and there are some known issues that can lead to unexpected behavior during runs. - *Incomplete cleanup of Dragon resources*: when SmartSim exits, it ensures that the dragon infrastructure is correctly shut down, so that all the associated resources (such as @@ -149,6 +150,5 @@ and there are some known issues that can lead to errors during runs. 2. System or Workload Manager too busy: try setting the environment variable ``SMARTSIM_DRAGON_STARTUP_TIMEOUT`` to a larger value or to ``"-1"``. - - *MPI-based applications hanging*: to run MPI-based applications on Dragon, Cray PMI or Cray PALS - must be available on the system. This is a current limitation and is actively been worked on. \ No newline at end of file + must be available on the system. This is a current limitation and is actively being worked on. \ No newline at end of file diff --git a/doc/experiment.rst b/doc/experiment.rst index 91b14cd6c..2dd624ac6 100644 --- a/doc/experiment.rst +++ b/doc/experiment.rst @@ -60,7 +60,7 @@ manage, and monitor. Currently, SmartSim supports 7 `launcher` options: 4. ``pals``: for systems using the PALS scheduler 5. ``lsf``: for systems using the LSF scheduler 6. ``auto``: have SmartSim auto-detect the launcher to use from the above -7. ``dragon``: if Dragon was installed in the current Python environment +7. 
``dragon``: if Dragon was installed in the current Python environment, see :ref:`Dragon Install ` If the systems `launcher` cannot be found or no `launcher` argument is provided, the default value of `"local"` will be assigned which will start all ``Experiment`` launched entities on the diff --git a/doc/installation_instructions/basic.rst b/doc/installation_instructions/basic.rst index b7428a60c..02c17e1fd 100644 --- a/doc/installation_instructions/basic.rst +++ b/doc/installation_instructions/basic.rst @@ -257,7 +257,7 @@ For example, to install dragon alongside the RedisAI CPU backends, you can run .. note:: Dragon is only supported on Linux systems. For further information, you - can read the dedicated documentation section. + can read :ref:`the dedicated documentation page `. ========== SmartRedis From 4cd33de4614743a7a29a622e76b255280f26077e Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 10 May 2024 00:58:23 +0200 Subject: [PATCH 092/101] Post-merge fix --- .../_core/launcher/dragon/dragonConnector.py | 50 +++++++++++++++++-- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index f3d9c72d4..acb390606 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -83,6 +83,7 @@ def __init__(self) -> None: self._dragon_head_pid: t.Optional[int] = None self._dragon_server_path = config.dragon_server_path logger.debug(f"Dragon Server path was set to {self._dragon_server_path}") + self._env_vars: t.Dict[str, str] = {} if self._dragon_server_path is None: raise SmartSimError( "DragonConnector could not find the dragon server path. 
" @@ -213,6 +214,47 @@ def _start_connector_socket(self, socket_addr: str) -> zmq.Socket[t.Any]: return connector_socket + def _load_persisted_env(self) -> t.Dict[str, str]: + """Load key-value pairs from a .env file created during dragon installation""" + if self._env_vars: + # use previously loaded env vars. + return self._env_vars + + config = get_config() + + if not config.dragon_dotenv.exists(): + self._env_vars = {} + return self._env_vars + + with open(config.dragon_dotenv, encoding="utf-8") as dot_env: + for kvp in dot_env.readlines(): + split = kvp.strip().split("=", maxsplit=1) + key, value = split[0], split[-1] + self._env_vars[key] = value + + return self._env_vars + + def _merge_persisted_env(self, current_env: t.Dict[str, str]) -> t.Dict[str, str]: + """Combine the current environment variable set with the dragon .env by adding + Dragon-specific values and prepending any new values to existing keys""" + # ensure we start w/a complete env from current env state + merged_env: t.Dict[str, str] = {**current_env} + + # copy all the values for dragon straight into merged_env + merged_env.update( + {k: v for k, v in self._env_vars.items() if k.startswith("DRAGON")} + ) + + # prepend dragon env updates into existing env vars + for key, value in self._env_vars.items(): + if not key.startswith("DRAGON"): + if current_value := current_env.get(key, None): + # when a key is not dragon specific, don't overwrite the current + # value. 
instead, prepend the value dragon needs to/current env + value = f"{value}:{current_value}" + merged_env[key] = value + return merged_env + def connect_to_dragon(self) -> None: config = get_config() with DRG_LOCK: @@ -262,12 +304,14 @@ def connect_to_dragon(self) -> None: dragon_out_file = path / "dragon_head.out" dragon_err_file = path / "dragon_head.err" + self._load_persisted_env() + merged_env = self._merge_persisted_env(os.environ.copy()) + merged_env.update({"PYTHONUNBUFFERED": "1"}) + with ( open(dragon_out_file, "w", encoding="utf-8") as dragon_out, open(dragon_err_file, "w", encoding="utf-8") as dragon_err, ): - current_env = os.environ.copy() - current_env.update({"PYTHONUNBUFFERED": "1"}) logger.debug(f"Starting Dragon environment: {' '.join(cmd)}") # pylint: disable-next=consider-using-with @@ -278,7 +322,7 @@ def connect_to_dragon(self) -> None: stdout=dragon_out.fileno(), cwd=path, shell=False, - env=current_env, + env=merged_env, start_new_session=True, ) From c935a5f1d333777b05777fb8f0ff0436b457a712 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 10 May 2024 01:34:15 +0200 Subject: [PATCH 093/101] Fix wrongly merged test --- tests/test_dragon_launcher.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_dragon_launcher.py b/tests/test_dragon_launcher.py index 529565cd5..316b9ada2 100644 --- a/tests/test_dragon_launcher.py +++ b/tests/test_dragon_launcher.py @@ -470,8 +470,6 @@ def test_load_env_cached_env(monkeypatch: pytest.MonkeyPatch, test_dir: str): # attempt to load and if it doesn't blow up, it used the cached copy - connector = DragonConnector() - loaded_env = connector._load_persisted_env() assert loaded_env From 68a145389c8c1a7f9d8b32c347e453ff6b38b093 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 10 May 2024 10:45:31 -0500 Subject: [PATCH 094/101] Pass dragon env to launched jobs --- smartsim/_core/launcher/dragon/dragonConnector.py | 7 +++++-- smartsim/_core/launcher/dragon/dragonLauncher.py | 8 +++++--- 
tests/test_dragon_launcher.py | 2 +- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index acb390606..1abeb4b08 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -234,7 +234,7 @@ def _load_persisted_env(self) -> t.Dict[str, str]: return self._env_vars - def _merge_persisted_env(self, current_env: t.Dict[str, str]) -> t.Dict[str, str]: + def merge_persisted_env(self, current_env: t.Dict[str, str]) -> t.Dict[str, str]: """Combine the current environment variable set with the dragon .env by adding Dragon-specific values and prepending any new values to existing keys""" # ensure we start w/a complete env from current env state @@ -305,9 +305,12 @@ def connect_to_dragon(self) -> None: dragon_err_file = path / "dragon_head.err" self._load_persisted_env() - merged_env = self._merge_persisted_env(os.environ.copy()) + merged_env = self.merge_persisted_env(os.environ.copy()) merged_env.update({"PYTHONUNBUFFERED": "1"}) + logger.debug(merged_env["PATH"]) + logger.debug(merged_env["LD_LIBRARY_PATH"]) + with ( open(dragon_out_file, "w", encoding="utf-8") as dragon_out, open(dragon_err_file, "w", encoding="utf-8") as dragon_err, diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py index f33c19ad5..b55c697e5 100644 --- a/smartsim/_core/launcher/dragon/dragonLauncher.py +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -163,7 +163,9 @@ def run(self, step: Step) -> t.Optional[str]: step_id = "PBS-" + sublauncher_step_id elif isinstance(step, DragonStep): run_args = step.run_settings.run_args - env = step.run_settings.env_vars + req_env = step.run_settings.env_vars + self._connector._load_persisted_env() + merged_env = self._connector.merge_persisted_env(os.environ.copy()) nodes = int(run_args.get("nodes", None) or 1) 
tasks_per_node = int(run_args.get("tasks-per-node", None) or 1) response = _assert_schema_type( @@ -175,8 +177,8 @@ def run(self, step: Step) -> t.Optional[str]: name=step.name, nodes=nodes, tasks_per_node=tasks_per_node, - env=env, - current_env=os.environ, + env=req_env, + current_env=merged_env, output_file=out, error_file=err, ) diff --git a/tests/test_dragon_launcher.py b/tests/test_dragon_launcher.py index 316b9ada2..37cf4cf2e 100644 --- a/tests/test_dragon_launcher.py +++ b/tests/test_dragon_launcher.py @@ -506,7 +506,7 @@ def test_merge_env(monkeypatch: pytest.MonkeyPatch, test_dir: str): "ONLY_IN_CURRENT": curr_only, # expect pass-through } - merged_env = connector._merge_persisted_env(curr_env) + merged_env = connector.merge_persisted_env(curr_env) # any dragon env vars should be overwritten assert merged_env["DRAGON_BASE_DIR"] != curr_base_dir From fa4ede69b6455ac1a615d8625195f8ad9545db7e Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 10 May 2024 11:08:49 -0500 Subject: [PATCH 095/101] Address @ankona's review --- smartsim/_core/entrypoints/dragon_client.py | 31 +++++++++++++-- .../_core/launcher/dragon/dragonConnector.py | 38 ++++++++++++++++++- 2 files changed, 64 insertions(+), 5 deletions(-) diff --git a/smartsim/_core/entrypoints/dragon_client.py b/smartsim/_core/entrypoints/dragon_client.py index 2e51ba603..fffe28264 100644 --- a/smartsim/_core/entrypoints/dragon_client.py +++ b/smartsim/_core/entrypoints/dragon_client.py @@ -47,7 +47,8 @@ from smartsim.log import get_logger """ -Dragon server entrypoint script +Dragon client entrypoint script, used to start a server, send requests to it +and then shut it down. 
""" logger = get_logger("Dragon Client") @@ -61,10 +62,16 @@ class DragonClientEntrypointArgs: def cleanup() -> None: + """Cleanup resources""" logger.debug("Cleaning up") def parse_requests(request_filepath: Path) -> t.List[DragonRequest]: + """Parse serialized requests from file + + :param request_filepath: Path to file with serialized requests + :return: Deserialized requests + """ requests: t.List[DragonRequest] = [] try: with open(request_filepath, "r", encoding="utf-8") as request_file: @@ -85,6 +92,12 @@ def parse_requests(request_filepath: Path) -> t.List[DragonRequest]: def parse_arguments(args: t.List[str]) -> DragonClientEntrypointArgs: + """Parse arguments used to run entrypoint script + + :param args: Arguments without name of executable + :raises ValueError: If the request file is not specified + :return: Parsed arguments + """ parser = argparse.ArgumentParser( prefix_chars="+", description="SmartSim Dragon Client Process, to be used in batch scripts", @@ -93,12 +106,17 @@ def parse_arguments(args: t.List[str]) -> DragonClientEntrypointArgs: args_ = parser.parse_args(args) if not args_.submit: - raise ValueError("Empty request file.") + raise ValueError("Request file not provided.") return DragonClientEntrypointArgs(submit=Path(args_.submit)) def handle_signal(signo: int, _frame: t.Optional[FrameType] = None) -> None: + """Handle signals sent to this process + + :param signo: Signal number + :param _frame: Frame, defaults to None + """ if not signo: logger.info("Received signal with no signo") else: @@ -107,14 +125,21 @@ def handle_signal(signo: int, _frame: t.Optional[FrameType] = None) -> None: def register_signal_handlers() -> None: + """Register signal handlers prior to execution + """ # make sure to register the cleanup before the start # the process so our signaller will be able to stop - # the database process. + # the server process. 
for sig in SIGNALS: signal.signal(sig, handle_signal) def execute_entrypoint(args: DragonClientEntrypointArgs) -> int: + """Execute the entrypoint with specified arguments + + :param args: Parsed arguments + :return: Return code + """ try: requests = parse_requests(args.submit) diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 1abeb4b08..58a50dc7c 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -95,10 +95,18 @@ def __init__(self) -> None: @property def is_connected(self) -> bool: + """Whether the Connector established a connection to the server + + :return: True if connected + """ return self._dragon_head_socket is not None @property def can_monitor(self) -> bool: + """Whether the Connector knows the PID of the dragon server head process + and can monitor its status + + :return: True if the server can be monitored""" return self._dragon_head_pid is not None def _handshake(self, address: str) -> None: @@ -138,6 +146,13 @@ def _reset_timeout(self, timeout: int = get_config().dragon_server_timeout) -> N pass def ensure_connected(self) -> None: + """Ensure that the Connector established a connection to the server + + If the Connector is not connected, attempt to connect and raise an error + on failure. + + :raises SmartSimError: if connection cannot be established + """ if not self.is_connected: self.connect_to_dragon() if not self.is_connected: @@ -215,7 +230,9 @@ def _start_connector_socket(self, socket_addr: str) -> zmq.Socket[t.Any]: return connector_socket def _load_persisted_env(self) -> t.Dict[str, str]: - """Load key-value pairs from a .env file created during dragon installation""" + """Load key-value pairs from a .env file created during dragon installation + + :return: Key-value pairs stored in .env file""" if self._env_vars: # use previously loaded env vars. 
return self._env_vars @@ -236,7 +253,11 @@ def _load_persisted_env(self) -> t.Dict[str, str]: def merge_persisted_env(self, current_env: t.Dict[str, str]) -> t.Dict[str, str]: """Combine the current environment variable set with the dragon .env by adding - Dragon-specific values and prepending any new values to existing keys""" + Dragon-specific values and prepending any new values to existing keys + + :param current_env: Environment which has to be merged with .env variables + :return: Merged environment + """ # ensure we start w/a complete env from current env state merged_env: t.Dict[str, str] = {**current_env} @@ -256,6 +277,10 @@ def merge_persisted_env(self, current_env: t.Dict[str, str]) -> t.Dict[str, str] return merged_env def connect_to_dragon(self) -> None: + """Connect to Dragon server + + :raises SmartSimError: If connection cannot be established + """ config = get_config() with DRG_LOCK: # TODO use manager instead @@ -368,6 +393,8 @@ def connect_to_dragon(self) -> None: logger.warning("Could not start Dragon server as subprocess") def cleanup(self) -> None: + """Shut down Dragon server and authenticator thread + """ if self._dragon_head_socket is not None and self._dragon_head_pid is not None: _dragon_cleanup( server_socket=self._dragon_head_socket, @@ -379,6 +406,13 @@ def cleanup(self) -> None: self._authenticator = None def send_request(self, request: DragonRequest, flags: int = 0) -> DragonResponse: + """Send a request to the Dragon server using a secure socket + + :param request: The request to send + :param flags: 0MQ flags, defaults to 0 + :raises SmartSimError: If not connected to Dragon server + :return: Response from server + """ self.ensure_connected() if (socket := self._dragon_head_socket) is None: raise SmartSimError("Not connected to Dragon") From 2981d144c6648d099b0568b52572b92db9e993fc Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 10 May 2024 11:11:45 -0500 Subject: [PATCH 096/101] Style and lint --- 
smartsim/_core/entrypoints/dragon_client.py | 3 +-- smartsim/_core/launcher/dragon/dragonConnector.py | 7 +++---- smartsim/_core/launcher/dragon/dragonLauncher.py | 2 +- tests/test_dragon_launcher.py | 10 +++++----- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/smartsim/_core/entrypoints/dragon_client.py b/smartsim/_core/entrypoints/dragon_client.py index fffe28264..e998ddce1 100644 --- a/smartsim/_core/entrypoints/dragon_client.py +++ b/smartsim/_core/entrypoints/dragon_client.py @@ -125,8 +125,7 @@ def handle_signal(signo: int, _frame: t.Optional[FrameType] = None) -> None: def register_signal_handlers() -> None: - """Register signal handlers prior to execution - """ + """Register signal handlers prior to execution""" # make sure to register the cleanup before the start # the process so our signaller will be able to stop # the server process. diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 58a50dc7c..1b6fc1a63 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -229,7 +229,7 @@ def _start_connector_socket(self, socket_addr: str) -> zmq.Socket[t.Any]: return connector_socket - def _load_persisted_env(self) -> t.Dict[str, str]: + def load_persisted_env(self) -> t.Dict[str, str]: """Load key-value pairs from a .env file created during dragon installation :return: Key-value pairs stored in .env file""" @@ -329,7 +329,7 @@ def connect_to_dragon(self) -> None: dragon_out_file = path / "dragon_head.out" dragon_err_file = path / "dragon_head.err" - self._load_persisted_env() + self.load_persisted_env() merged_env = self.merge_persisted_env(os.environ.copy()) merged_env.update({"PYTHONUNBUFFERED": "1"}) @@ -393,8 +393,7 @@ def connect_to_dragon(self) -> None: logger.warning("Could not start Dragon server as subprocess") def cleanup(self) -> None: - """Shut down Dragon server and authenticator thread - """ + 
"""Shut down Dragon server and authenticator thread""" if self._dragon_head_socket is not None and self._dragon_head_pid is not None: _dragon_cleanup( server_socket=self._dragon_head_socket, diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py index b55c697e5..041257366 100644 --- a/smartsim/_core/launcher/dragon/dragonLauncher.py +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -164,7 +164,7 @@ def run(self, step: Step) -> t.Optional[str]: elif isinstance(step, DragonStep): run_args = step.run_settings.run_args req_env = step.run_settings.env_vars - self._connector._load_persisted_env() + self._connector.load_persisted_env() merged_env = self._connector.merge_persisted_env(os.environ.copy()) nodes = int(run_args.get("nodes", None) or 1) tasks_per_node = int(run_args.get("tasks-per-node", None) or 1) diff --git a/tests/test_dragon_launcher.py b/tests/test_dragon_launcher.py index 37cf4cf2e..ee0fcb14b 100644 --- a/tests/test_dragon_launcher.py +++ b/tests/test_dragon_launcher.py @@ -423,7 +423,7 @@ def test_load_env_no_file(monkeypatch: pytest.MonkeyPatch, test_dir: str): connector = DragonConnector() - loaded_env = connector._load_persisted_env() + loaded_env = connector.load_persisted_env() assert not loaded_env @@ -443,7 +443,7 @@ def test_load_env_env_file_created(monkeypatch: pytest.MonkeyPatch, test_dir: st # load config w/launcher connector = DragonConnector() - loaded_env = connector._load_persisted_env() + loaded_env = connector.load_persisted_env() assert loaded_env # confirm .env was parsed as expected by inspecting a key @@ -462,7 +462,7 @@ def test_load_env_cached_env(monkeypatch: pytest.MonkeyPatch, test_dir: str): # load config w/launcher connector = DragonConnector() - loaded_env = connector._load_persisted_env() + loaded_env = connector.load_persisted_env() assert loaded_env # ensure attempting to reload would bomb @@ -470,7 +470,7 @@ def test_load_env_cached_env(monkeypatch: 
pytest.MonkeyPatch, test_dir: str): # attempt to load and if it doesn't blow up, it used the cached copy - loaded_env = connector._load_persisted_env() + loaded_env = connector.load_persisted_env() assert loaded_env @@ -485,7 +485,7 @@ def test_merge_env(monkeypatch: pytest.MonkeyPatch, test_dir: str): # load config w/launcher connector = DragonConnector() - loaded_env = {**connector._load_persisted_env()} + loaded_env = {**connector.load_persisted_env()} assert loaded_env curr_base_dir = "/foo" From 3dc0b9fff29767080042511bdf5fc81f1523000f Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 10 May 2024 12:30:36 -0500 Subject: [PATCH 097/101] Adapt tests --- smartsim/_core/launcher/dragon/dragonConnector.py | 3 --- tests/on_wlm/test_preview_wlm.py | 14 ++++++++------ 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 1b6fc1a63..0cd68c24e 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -333,9 +333,6 @@ def connect_to_dragon(self) -> None: merged_env = self.merge_persisted_env(os.environ.copy()) merged_env.update({"PYTHONUNBUFFERED": "1"}) - logger.debug(merged_env["PATH"]) - logger.debug(merged_env["LD_LIBRARY_PATH"]) - with ( open(dragon_out_file, "w", encoding="utf-8") as dragon_out, open(dragon_err_file, "w", encoding="utf-8") as dragon_err, diff --git a/tests/on_wlm/test_preview_wlm.py b/tests/on_wlm/test_preview_wlm.py index 72fc7564c..78da30c9a 100644 --- a/tests/on_wlm/test_preview_wlm.py +++ b/tests/on_wlm/test_preview_wlm.py @@ -105,9 +105,10 @@ def test_preview_wlm_run_commands_cluster_orc_model( output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") # Evaluate output - assert "Run Command" in output + if pytest.test_launcher != "dragon": + assert "Run Command" in output + assert "ntasks" in output assert "Run Arguments" in output - assert 
"ntasks" in output assert "nodes" in output @@ -133,11 +134,12 @@ def test_preview_model_on_wlm(fileutils, test_dir, wlmutils): # Execute preview method output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") - assert "Run Command" in output + if pytest.test_launcher != "dragon": + assert "Run Command" in output + assert "ntasks" in output + assert "time" in output + assert "nodes" in output assert "Run Arguments" in output - assert "nodes" in output - assert "ntasks" in output - assert "time" in output @pytest.mark.skipif( From adedbe440a24cb387592167da77fdc983ed31764 Mon Sep 17 00:00:00 2001 From: Amanda Richardson Date: Fri, 10 May 2024 11:45:16 -0700 Subject: [PATCH 098/101] addressing dragon comments --- doc/api/smartsim_api.rst | 1 + doc/dragon.rst | 173 ++++++++++++++++++++------------------- doc/experiment.rst | 4 +- doc/run_settings.rst | 22 +++++ 4 files changed, 113 insertions(+), 87 deletions(-) diff --git a/doc/api/smartsim_api.rst b/doc/api/smartsim_api.rst index 22d17c835..88c173783 100644 --- a/doc/api/smartsim_api.rst +++ b/doc/api/smartsim_api.rst @@ -60,6 +60,7 @@ Types of Settings: MpiexecSettings OrterunSettings JsrunSettings + DragonRunSettings SbatchSettings QsubBatchSettings BsubBatchSettings diff --git a/doc/dragon.rst b/doc/dragon.rst index f75280242..753022118 100644 --- a/doc/dragon.rst +++ b/doc/dragon.rst @@ -2,121 +2,124 @@ Dragon ****** -============ -Introduction -============ +======== +Overview +======== `Dragon `_ is a composable distributed run-time targeting HPC workflows. In SmartSim, Dragon can be used as a launcher, within a Slurm or PBS allocation or batch job. -.. note:: - The Dragon launcher is at an early development stage and should be considered - a prototype implementation. Please report any issue you encounter while using - it and provide feedback about missing features you would like to see - implemented. +.. 
warning:: + The Dragon launcher is currently in its early development stage and should be treated as + a prototype implementation. Your assistance is invaluable in identifying any issues + encountered during usage and suggesting missing features for implementation. Please + provide feedback in the form of a created issue on the + `SmartSim issues page `_ on GitHub. ===== Usage ===== -To be able to use Dragon, you will have to install it in your current Python -environment. This can be done as part of the ``smart build`` step, as explained -in :ref:`Dragon Install `. +To use Dragon, you need to install it in your current Python environment. This can +be accomplished by providing the ``--dragon`` flag to the ``smart build`` command, as +detailed in the :ref:`Dragon Install ` section. Note that specifying the device +configuration is also required for a proper build. -Once installed, Dragon can be selected as launcher when creating an ``Experiment``: +After installation, specify Dragon as the launcher when creating an ``Experiment``: .. code-block:: python exp = Experiment(name="dragon-example", launcher="dragon") - -Dragon has its own run settings class, ``DragonRunSettings``, -which can be used to specify nodes and tasks per node for a ``Model``, -for example, continuing from the previous example: +Dragon introduces its own run settings class, ``DragonRunSettings``, which allows users to +specify nodes and tasks per node for a ``Model``. For instance, continuing from the previous +example: .. 
code-block:: python + # Because "dragon" was specified as the launcher during Experiment initialization, + # create_run_settings will return a DragonRunSettings object rs = exp.create_run_settings(exe="mpi_app", exe_args=["--option", "value"], env_vars={"MYVAR": "VALUE"}) + # Above we specify the executable (exe), executable arguments (exe_args) + # and environment variables (env_vars) + + # Sets the number of nodes for this job rs.set_nodes(4) + # Set the tasks per node for this job rs.set_tasks_per_node(3) + # Initialize the Model and pass in the DragonRunSettings object mpi_app = exp.create_model("MPI_APP", run_settings=rs) + # Start the Model exp.start(mpi_app) +SmartSim supports ``DragonRunSettings`` with ``Model``, ``Ensemble`` and ``Orchestrator`` entities. +In the next sections, we detail how Dragon is integrated into SmartSim. -All types of SmartSim entities are supported, including ``Ensemble`` -and ``Orchestrator``, and the underlying Dragon launcher is completely -transparent to the user. In the next sections, we will explain -how Dragon is integrated into SmartSim. +For more information on HPC launchers, visit the :ref:`Run Settings` page. ================= The Dragon Server ================= -Dragon can start processes on any resource available within an allocation. -To do this, the so-called Dragon infrastructure needs to be started. SmartSim -instantiates the Dragon infrastructure whenever a ``Model`` needs to be started -and will keep it up and running until the parent ``Experiment`` is active. -To be able to interact with processes started through Dragon, -SmartSim spins up a command server in the Dragon infrastructure and sends commands -to it every time a process needs to be started or stopped, and to query its status. -We call this server the `Dragon Server`, and its lifecycle is managed by SmartSim. +Dragon can initiate processes on any available resource within an allocation. 
To facilitate +this, SmartSim initializes the Dragon infrastructure whenever a ``Model`` is launched and maintains +it until the parent ``Experiment`` concludes. To facilitate interaction with processes managed by +Dragon, SmartSim establishes a command server within the Dragon infrastructure. This server, +known as the `Dragon Server`, is responsible for executing commands to start or stop processes +and to query their status. Sharing the Dragon Server across Experiments ============================================ -Currently, SmartSim only supports one Dragon server per allocation. For this reason, -if multiple ``Experiments`` need to run in the same allocation, the Dragon server needs -to be shared among them. By default, the server is started from a subdirectory of the -``Experiment`` path. To make it possible to share the server, it is possible to -specify a path from which the Server should be started. This can be done -by setting the environment variable -``SMARTSIM_DRAGON_SERVER_PATH`` to an existing path: every ``Experiment`` will look for the running -server in the given path and only start a new server instance if there is none running. +Currently, SmartSim supports only one Dragon server per allocation. Consequently, +if multiple Experiments need to run within the same allocation, the Dragon server +must be shared among them. By default, the server is initiated from a subdirectory +of the ``Experiment`` path. To enable server sharing, users can specify a custom path +from which the server should be launched. This can be achieved by setting the +environment variable ``SMARTSIM_DRAGON_SERVER_PATH`` to an existing path. Each ``Experiment`` +will then search for the running server in the specified path and initiate a new +server instance only if none is already running. 
Dragon's High-Speed Transport Agents ==================================== -On systems where the HPE Slingshot interconnect is available, Dragon can use -Higs-Speed Transport Agents (HSTA) to send internal messages. This is the default -choice for messages sent in the Dragon infrastructure started by SmartSim. On -systems where the HPE Slingshot interconnect is not available, TCP agents must be -used. To specify TCP agents, the environment variable ``SMARTSIM_DRAGON_TRANSPORT`` -must be set to ``tcp`` prior to the ``Experiment`` execution. +On systems equipped with the HPE Slingshot interconnect, Dragon utilizes High-Speed +Transport Agents (HSTA) by default for internal messaging within the infrastructure +launched by SmartSim. On systems without the HPE Slingshot interconnect, +TCP agents are employed. To specify the use of TCP agents, users must set the environment +variable ``SMARTSIM_DRAGON_TRANSPORT`` to tcp prior to executing the Experiment. -============ -Communcation -============ +============= +Communication +============= SmartSim and the Dragon Server communicate using `ZeroMQ `_. -As with any communication protocol, some timeouts for send and receive operations must be defined. -SmartSim sets some default timeouts that have been tested to work on most available systems, -but if you see failed communication attempts, you may want to try to adjust the -timeouts by setting the corresponding environment variable. -The timeouts are given in milliseconds and they are defined as follows: - -- *server start-up timeout*: the time waited by the SmartSim ``Experiment`` when the server - is first started. This timeout must account for the time it takes Dragon to set up the - infrastructure, which depends on the system's workload manager response time. - Defaults to ``"300000"`` (i.e. five minutes) and can be overridden with the environment variable - ``SMARTSIM_DRAGON_STARTUP_TIMEOUT``. 
- -- *server send and receive timeout*: the time waited by SmartSim and the Dragon server to send or - receive a message. Defaults to ``"30000"`` (i.e. 30 seconds) and can be overridden with the - environment variable ``SMARTSIM_DRAGON_TIMEOUT``. +Similar to other communication protocols, defining timeouts for send and receive operations +is crucial in SmartSim. SmartSim configures default timeouts that have been tested on various +systems. However, if you encounter failed communication attempts, adjusting the timeouts may +be necessary. You can adjust these timeouts by setting the corresponding environment variables: -Setting any timeout to ``"-1"`` will result in infinite waiting time, which means that the -execution will block until the communication is completed, and hang indefinitely if something went wrong. +- **Server Start-up Timeout**: This timeout specifies the duration the SmartSim ``Experiment`` + waits when the server is initially started. It must accommodate the time required for + Dragon to set up the infrastructure, which varies based on the system's workload manager + response time. The default timeout is `"300000"` milliseconds (i.e., five minutes), and you can override + it using the ``SMARTSIM_DRAGON_STARTUP_TIMEOUT`` environment variable. +- **Server Send and Receive Timeout**: This timeout dictates how long SmartSim and the Dragon + server wait to send or receive a message. The default timeout is `"30000"` milliseconds (i.e., 30 seconds), + and you can modify it using the ``SMARTSIM_DRAGON_TIMEOUT`` environment variable. -All communications are secured with elliptic curve cryptography, -and the key-pairs needed by the protocol are created by SmartSim and stored in the -user's home directory, unless another path is specified through the environment variable -``SMARTSIM_KEY_PATH``. 
+Setting any timeout to "-1" will result in an infinite waiting time, causing the execution to +block until the communication is completed, potentially hanging indefinitely if issues occur. +It's important to note that all communications are secured with elliptic curve cryptography. +SmartSim generates the necessary key-pairs and stores them in the user's home directory by +default. However, you can specify an alternative path using the ``SMARTSIM_KEY_PATH`` environment +variable. .. _dragon_known_issues: @@ -124,31 +127,31 @@ user's home directory, unless another path is specified through the environment Known issues ============ -As previosuly remarked, the SmartSim-Dragon integration is at an early development stage -and there are some known issues that can lead to unexpected behavior during runs. +As previously noted, the integration of SmartSim with Dragon is still in its early +development stage, and there are known issues that may result in unexpected behavior +during runs: -- *Incomplete cleanup of Dragon resources*: when SmartSim exits, it ensures that the dragon - infrastructure is correctly shut down, so that all the associated resources (such as - shared memory segments) are cleaned up and all processes are terminated. Nevertheless, - in some rare cases, when the execution is interrupted abruptly (for example by terminating - SmartSim with ``SIGKILL``), the cleanup process can be incomplete and processes - such as the Dragon overlay network will remain active on the node where SmartSim was - executed (which could be a login node, especially on Slurm systems). If that happens - you can run +- **Incomplete cleanup of Dragon resources**: When SmartSim exits, it attempts to properly + shut down the Dragon infrastructure to clean up associated resources, such as shared memory + segments, and terminate all processes. 
However, in rare cases, if the execution is + abruptly interrupted (e.g., by terminating SmartSim with ``SIGKILL``), the cleanup process + may be incomplete, leaving processes like the Dragon overlay network active on the node + where SmartSim was executed (which could be a login node, particularly on Slurm systems). + If this occurs, you can use the following command to address the issue: .. code-block:: smart teardown --dragon - which will kill all Dragon related processes, return shared memory segments, but also - kill all Python processes (associated to your user name). + This command will terminate all Dragon-related processes, release shared memory segments, + but also terminate all Python processes associated with your username. -- *Dragon server not starting*: this can happen because of two main reasons +- **Dragon server not starting**: This issue may arise due to two main reasons: - 1. HSTA not available on the system: try setting the environment variable - ``SMARTSIM_DRAGON_TRANSPORT`` to ``tcp`` - 2. System or Workload Manager too busy: try setting the environment variable - ``SMARTSIM_DRAGON_STARTUP_TIMEOUT`` to a larger value or to ``"-1"``. + 1. *HSTA not available on the system*: Try setting the environment variable + ``SMARTSIM_DRAGON_TRANSPORT`` to ``tcp``. + 2. *System or Workload Manager too busy*: Attempt to mitigate this by setting the environment + variable ``SMARTSIM_DRAGON_STARTUP_TIMEOUT`` to a larger value or ``"-1"``. -- *MPI-based applications hanging*: to run MPI-based applications on Dragon, Cray PMI or Cray PALS - must be available on the system. This is a current limitation and is actively being worked on. \ No newline at end of file +- **MPI-based applications hanging**: To run MPI-based applications on Dragon, Cray PMI or + Cray PALS must be available on the system. This limitation is currently being addressed. 
\ No newline at end of file diff --git a/doc/experiment.rst b/doc/experiment.rst index 9b48d0f87..93797e910 100644 --- a/doc/experiment.rst +++ b/doc/experiment.rst @@ -59,8 +59,8 @@ manage, and monitor. Currently, SmartSim supports 7 `launcher` options: 3. ``pbs``: for systems using the PBS Pro scheduler 4. ``pals``: for systems using the PALS scheduler 5. ``lsf``: for systems using the LSF scheduler -6. ``auto``: have SmartSim auto-detect the launcher to use from the above -7. ``dragon``: if Dragon was installed in the current Python environment, see :ref:`Dragon Install ` +6. ``dragon``: if Dragon is installed in the current Python environment, see :ref:`Dragon Install ` +7. ``auto``: have SmartSim auto-detect the launcher to use from the above If the systems `launcher` cannot be found or no `launcher` argument is provided, the default value of `"local"` will be assigned which will start all ``Experiment`` launched entities on the diff --git a/doc/run_settings.rst b/doc/run_settings.rst index 449b61ea4..806a87155 100644 --- a/doc/run_settings.rst +++ b/doc/run_settings.rst @@ -304,6 +304,28 @@ for each job scheduler. Users may replace `mpirun` with `mpiexec` or `orterun`. + .. group-tab:: Dragon + The Dragon `launcher` does not support any launch binary. Below we step through initializing a ``DragonRunSettings`` instance on a Slurm + based machine. + + **DragonRunSettings** + + Run a job with the `dragon` launcher. + + .. code-block:: python + + from smartsim import Experiment + + # Initialize the experiment and provide launcher dragon + exp = Experiment("name-of-experiment", launcher="dragon") + + # Initialize a DragonRunSettings object + run_settings = exp.create_run_settings(exe="echo", exe_args="Hello World") + # Set the number of nodes for this job + run_settings.set_nodes(4) + # Set the number of tasks per node for this job + run_settings.set_tasks_per_node(10) + .. 
note:: SmartSim will look for an allocation by accessing the associated WLM job ID environment variable. If an allocation is present, the entity will be launched on the reserved compute resources. A user may also specify the allocation ID From d7fca0e18b2a0d6e9e61511b655312d153869c32 Mon Sep 17 00:00:00 2001 From: Amanda Richardson Date: Fri, 10 May 2024 12:07:06 -0700 Subject: [PATCH 099/101] broken link fix --- doc/dragon.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/dragon.rst b/doc/dragon.rst index 753022118..875b21343 100644 --- a/doc/dragon.rst +++ b/doc/dragon.rst @@ -15,7 +15,7 @@ Dragon can be used as a launcher, within a Slurm or PBS allocation or batch job. a prototype implementation. Your assistance is invaluable in identifying any issues encountered during usage and suggesting missing features for implementation. Please provide feedback in the form of a created issue on the - `SmartSim issues page `_ on github. + `SmartSim issues page `_ on github. ===== Usage From 498eb84924afed26157b478ba37cf4755402346b Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 10 May 2024 23:12:53 +0200 Subject: [PATCH 100/101] Address @amandarichardsonn's remaining comments --- README.md | 6 +++++- doc/dragon.rst | 32 +++++++++++++++++++++----------- doc/experiment.rst | 5 ++++- doc/run_settings.rst | 17 +++++++++-------- 4 files changed, 39 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index cfd8d4271..c0986042e 100644 --- a/README.md +++ b/README.md @@ -174,13 +174,17 @@ system with which it has a corresponding `RunSettings` class. If one can be foun ## Experiments on HPC Systems SmartSim integrates with common HPC schedulers providing batch and interactive -launch capabilities for all applications. 
+launch capabilities for all applications: - Slurm - LSF - PBSPro - Local (for laptops/single node, no batch) +In addition, on Slurm and PBS systems, [Dragon](https://dragonhpc.github.io/dragon/doc/_build/html/index.html) +can be used as a launcher. Please refer to the documentation for instructions on +how to install it on your system and use it in SmartSim. + ### Interactive Launch Example diff --git a/doc/dragon.rst b/doc/dragon.rst index 875b21343..89de718fd 100644 --- a/doc/dragon.rst +++ b/doc/dragon.rst @@ -9,13 +9,20 @@ Overview `Dragon `_ is a composable distributed run-time targeting HPC workflows. In SmartSim, Dragon can be used as a launcher, within a Slurm or PBS allocation or batch job. +The SmartSim team collaborates with the Dragon team to develop an efficient +launcher which will enable fast, interactive, and customized execution of +complex workflows on large HPC systems. As Dragon is scheduler-agonstic, +the same SmartSim script using Dragon as a launcher can be run indifferently +on a Slurm or PBS system. Support for other schedulers is coming soon. .. warning:: The Dragon launcher is currently in its early development stage and should be treated as a prototype implementation. Your assistance is invaluable in identifying any issues encountered during usage and suggesting missing features for implementation. Please provide feedback in the form of a created issue on the - `SmartSim issues page `_ on github. + `SmartSim issues page `_ on GitHub. + The :ref:`Known Issues section` is also a good starting + point when troubleshooting workflows run through the Dragon launcher. ===== Usage @@ -76,12 +83,13 @@ Sharing the Dragon Server across Experiments Currently, SmartSim supports only one Dragon server per allocation. Consequently, if multiple Experiments need to run within the same allocation, the Dragon server -must be shared among them. By default, the server is initiated from a subdirectory -of the ``Experiment`` path.
To enable server sharing, users can specify a custom path +must be shared among them. By default, the server starts from a subdirectory +of the ``Experiment`` path, where it creates a configuration file. +To enable server sharing, users can specify a custom path from which the server should be launched. This can be achieved by setting the -environment variable ``SMARTSIM_DRAGON_SERVER_PATH`` to an existing path. Each ``Experiment`` -will then search for the running server in the specified path and initiate a new -server instance only if none is already running. +environment variable ``SMARTSIM_DRAGON_SERVER_PATH`` to an existing absolute path. +Each ``Experiment`` will then search for the configuration file in the specified path +and initiate a new server instance only if the file is not found. Dragon's High-Speed Transport Agents ==================================== @@ -90,7 +98,8 @@ On systems equipped with the HPE Slingshot interconnect, Dragon utilizes High-Sp Transport Agents (HSTA) by default for internal messaging within the infrastructure launched by SmartSim. On systems without the HPE Slingshot interconnect, TCP agents are employed. To specify the use of TCP agents, users must set the environment -variable ``SMARTSIM_DRAGON_TRANSPORT`` to tcp prior to executing the Experiment. +variable ``SMARTSIM_DRAGON_TRANSPORT`` to ``tcp`` prior to executing the Experiment. +To specify HSTA, ``SMARTSIM_DRAGON_TRANSPORT`` can be set to ``hsta`` or left unset. ============= Communication @@ -100,7 +109,8 @@ SmartSim and the Dragon Server communicate using `ZeroMQ `_ Similar to other communication protocols, defining timeouts for send and receive operations is crucial in SmartSim. SmartSim configures default timeouts that have been tested on various -systems. However, if you encounter failed communication attempts, adjusting the timeouts may +systems, such as Polaris, Perlmutter, and other HPE Cray EX and Apollo systems. 
+However, if you encounter failed communication attempts, adjusting the timeouts may be necessary. You can adjust these timeouts by setting the corresponding environment variables: - **Server Start-up Timeout**: This timeout specifies the duration the SmartSim ``Experiment`` @@ -116,10 +126,10 @@ be necessary. You can adjust these timeouts by setting the corresponding environ Setting any timeout to "-1" will result in an infinite waiting time, causing the execution to block until the communication is completed, potentially hanging indefinitely if issues occur. -It's important to note that all communications are secured with elliptic curve cryptography. +It's important to note that all communications are secured with `elliptic curve cryptography `_. SmartSim generates the necessary key-pairs and stores them in the user's home directory by -default. However, you can specify an alternative path using the ``SMARTSIM_KEY_PATH`` environment -variable. +default. However, you can specify an alternative absolute path using the ``SMARTSIM_KEY_PATH`` +environment variable. .. _dragon_known_issues: diff --git a/doc/experiment.rst b/doc/experiment.rst index 93797e910..716df1228 100644 --- a/doc/experiment.rst +++ b/doc/experiment.rst @@ -60,7 +60,10 @@ manage, and monitor. Currently, SmartSim supports 7 `launcher` options: 4. ``pals``: for systems using the PALS scheduler 5. ``lsf``: for systems using the LSF scheduler 6. ``dragon``: if Dragon is installed in the current Python environment, see :ref:`Dragon Install ` -7. ``auto``: have SmartSim auto-detect the launcher to use from the above +7. ``auto``: have SmartSim auto-detect the launcher to use (will not detect ``dragon``) + +The :ref:`Dragon-based launcher ` can be run on PBS- or Slurm-based systems +(MPI applications are supported only when Cray PMI or Cray PALS are available). 
If the systems `launcher` cannot be found or no `launcher` argument is provided, the default value of `"local"` will be assigned which will start all ``Experiment`` launched entities on the diff --git a/doc/run_settings.rst b/doc/run_settings.rst index 806a87155..ed12df8cb 100644 --- a/doc/run_settings.rst +++ b/doc/run_settings.rst @@ -176,6 +176,13 @@ for each job scheduler. Users may replace `mpirun` with `mpiexec` or `orterun`. + + .. note:: + SmartSim will look for an allocation by accessing the associated WLM job ID environment variable. If an allocation + is present, the entity will be launched on the reserved compute resources. A user may also specify the allocation ID + when initializing a run settings object via the `alloc` argument. If an allocation is specified, the entity receiving + these run parameters will launch on that allocation. + .. group-tab:: PBS Pro The PBS Pro `launcher` supports the :ref:`AprunSettings API ` as well as the :ref:`MpirunSettings API `, :ref:`MpiexecSettings API ` and :ref:`OrterunSettings API ` that each can be used to run executables @@ -305,8 +312,8 @@ for each job scheduler. Users may replace `mpirun` with `mpiexec` or `orterun`. .. group-tab:: Dragon - The Dragon `launcher` does not support any launch binary. Below we step through initializing a ``DragonRunSettings`` instance on a Slurm - based machine. + The Dragon `launcher` does not need any launch binary. Below we step through initializing a ``DragonRunSettings`` instance on a Slurm- + or PBS-based machine. **DragonRunSettings** @@ -325,9 +332,3 @@ for each job scheduler. run_settings.set_nodes(4) # Set the number of tasks per node for this job run_settings.set_tasks_per_node(10) - -.. note:: - SmartSim will look for an allocation by accessing the associated WLM job ID environment variable. If an allocation - is present, the entity will be launched on the reserved compute resources. 
A user may also specify the allocation ID - when initializing a run settings object via the `alloc` argument. If an allocation is specified, the entity receiving - these run parameters will launch on that allocation. \ No newline at end of file From a30c61ddbb645bddcffe8548e8677f3122476341 Mon Sep 17 00:00:00 2001 From: Amanda Richardson Date: Fri, 10 May 2024 14:41:42 -0700 Subject: [PATCH 101/101] update link --- doc/dragon.rst | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/doc/dragon.rst b/doc/dragon.rst index 89de718fd..0bf6a8ea3 100644 --- a/doc/dragon.rst +++ b/doc/dragon.rst @@ -6,23 +6,22 @@ Dragon Overview ======== -`Dragon `_ is a -composable distributed run-time targeting HPC workflows. In SmartSim, +Dragon is a composable distributed run-time targeting HPC workflows. In SmartSim, Dragon can be used as a launcher, within a Slurm or PBS allocation or batch job. The SmartSim team collaborates with the Dragon team to develop an efficient launcher which will enable fast, interactive, and customized execution of -complex workflows on large HPC systems. As Dragon is scheduler-agonstic, +complex workflows on large HPC systems. As Dragon is scheduler-agnostic, the same SmartSim script using Dragon as a launcher can be run indifferently -on a Slurm or PBS system. Support for other schedulers is coming soon. +on a Slurm or PBS system. Support for additional schedulers is coming soon. .. warning:: The Dragon launcher is currently in its early development stage and should be treated as a prototype implementation. Your assistance is invaluable in identifying any issues encountered during usage and suggesting missing features for implementation. Please provide feedback in the form of a created issue on the - `SmartSim issues page `_ on GitHub. + `SmartSim issues GitHub page `_. The :ref:`Known Issues section` is also a good starting - point when troubleshooting workflows run through the Dragon launcher. 
+ point when troubleshooting workflows run using the Dragon launcher. ===== Usage @@ -77,7 +76,6 @@ Dragon, SmartSim establishes a command server within the Dragon infrastructure. known as the `Dragon Server`, is responsible for executing commands to start or stop processes and to query their status. - Sharing the Dragon Server across Experiments ============================================ @@ -164,4 +162,8 @@ during runs: variable ``SMARTSIM_DRAGON_STARTUP_TIMEOUT`` to a larger value or ``"-1"``. - **MPI-based applications hanging**: To run MPI-based applications on Dragon, Cray PMI or - Cray PALS must be available on the system. This limitation is currently being addressed. \ No newline at end of file + Cray PALS must be available on the system. This limitation is currently being addressed. + + +Interested users can learn more about the Dragon project at the external +`Dragon documentation page `_. \ No newline at end of file