Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Application executes before colocated Orchestrator is created #522

Merged
merged 21 commits into from
Mar 21, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions smartsim/_core/entrypoints/colocated.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,14 @@
import tempfile
import typing as t
from pathlib import Path
from subprocess import PIPE, STDOUT
from subprocess import STDOUT
from types import FrameType

import filelock
import psutil

from smartredis import Client, ConfigOptions
from smartredis.error import RedisConnectionError, RedisReplyError

from smartsim._core.utils.network import current_ip
from smartsim.error import SSInternalError
from smartsim.log import get_logger
Expand Down Expand Up @@ -177,6 +177,7 @@ def main(
db_scripts: t.List[t.List[str]],
db_identifier: str,
) -> None:
# pylint: disable=too-many-statements
global DBPID # pylint: disable=global-statement

lo_address = current_ip("lo")
Expand All @@ -201,8 +202,10 @@ def main(
# we generally want to catch all exceptions here as
# if this process dies, the application will most likely fail
try:
process = psutil.Popen(cmd, stdout=PIPE, stderr=STDOUT)
DBPID = process.pid
with open("colo_orch_output.txt", "w", encoding="utf-8") as file:
amandarichardsonn marked this conversation as resolved.
Show resolved Hide resolved
process = psutil.Popen(cmd, stdout=file.fileno(), stderr=STDOUT)
DBPID = process.pid
print(f"__PID__{DBPID}__PID__", flush=True)
amandarichardsonn marked this conversation as resolved.
Show resolved Hide resolved

except Exception as e:
cleanup()
Expand Down Expand Up @@ -249,9 +252,6 @@ def launch_db_scripts(client: Client, db_scripts: t.List[t.List[str]]) -> None:
# Make sure we don't keep this around
del client

for line in iter(process.stdout.readline, b""):
print(line.decode("utf-8").rstrip(), flush=True)

except Exception as e:
cleanup()
logger.error(f"Colocated database process failed: {str(e)}")
Expand Down
15 changes: 9 additions & 6 deletions smartsim/_core/launcher/colocated.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ def write_colocated_launch_script(
script_file.write("set -e\n\n")

script_file.write("Cleanup () {\n")
script_file.write("echo error detected cleanup triggered\n")
amandarichardsonn marked this conversation as resolved.
Show resolved Hide resolved
script_file.write("echo $DBPID\n")
script_file.write("if ps -p $DBPID > /dev/null; then\n")
script_file.write("\tkill -15 $DBPID\n")
script_file.write("fi\n}\n\n")
Expand All @@ -67,9 +69,12 @@ def write_colocated_launch_script(
# STDOUT of the job
if colocated_settings["debug"]:
script_file.write("export SMARTSIM_LOG_LEVEL=debug\n")

script_file.write(f"{colocated_cmd}\n")
script_file.write("DBPID=$!\n\n")
script_file.write(f"db_stdout=$({colocated_cmd})\n")
# pylint: disable=anomalous-backslash-in-string
sed_command = "sed -n 's/.*__PID__\([0-9]*\)__PID__.*/\\1/p'"
printable_command = repr(sed_command)
print(printable_command)
amandarichardsonn marked this conversation as resolved.
Show resolved Hide resolved
script_file.write(f"DBPID=$(echo $db_stdout | {sed_command})\n")

# Write the actual launch command for the app
script_file.write("$@\n\n")
Expand Down Expand Up @@ -190,10 +195,8 @@ def _build_colocated_wrapper_cmd(
db_script_cmd = _build_db_script_cmd(db_scripts)
db_cmd.extend(db_script_cmd)

# run colocated db in the background
db_cmd.append("&")

cmd.extend(db_cmd)

return " ".join(cmd)


Expand Down
Loading