CrayLabs · al-rigazzi · Apr 4, 2024 · Jan 24, 2024 · Jan 24, 2024 · Jan 24, 2024
diff --git a/.github/workflows/changelog.yml b/.github/workflows/changelog.yml
@@ -28,22 +28,22 @@
 
 name: enforce_changelog
 
-on:
-  pull_request:
-  push:
-    branches:
-      - develop
+# on:
+#   pull_request:
+#   push:
+#     branches:
+#       - develop
 
 jobs:
   changelog:
     name: check_changelog
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v4
+      - uses: actions/checkout@v4
 
-    - name: Changelog Enforcer
-      uses: dangoslen/changelog-enforcer@v3.6.0
-      with:
-        changeLogPath: './doc/changelog.rst'
-        missingUpdateErrorMessage: 'changelog.rst has not been updated'
+      - name: Changelog Enforcer
+        uses: dangoslen/changelog-enforcer@v3.6.0
+        with:
+          changeLogPath: "./doc/changelog.rst"
+          missingUpdateErrorMessage: "changelog.rst has not been updated"
diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
@@ -57,12 +57,10 @@ jobs:
         os: [macos-12, macos-14, ubuntu-20.04] # Operating systems
         compiler: [8] # GNU compiler version
         rai: [1.2.7] # Redis AI versions
-        py_v: ["3.8", "3.9", "3.10", "3.11"] # Python versions
+        py_v: ["3.9", "3.10", "3.11"] # Python versions
         exclude:
           - os: macos-14
             py_v: "3.9"
-          - os: macos-14
-            py_v: "3.8"
 
     env:
       SMARTSIM_REDISAI: ${{ matrix.rai }}

diff --git a/conftest.py b/conftest.py
@@ -31,8 +31,10 @@
 import os
 import pathlib
 import shutil
+import subprocess
 import sys
 import tempfile
+import time
 import typing as t
 import uuid
 import warnings
@@ -43,14 +45,17 @@
 
 import smartsim
 from smartsim import Experiment
+from smartsim._core.launcher.dragon.dragonLauncher import DragonLauncher, _dragon_cleanup
 from smartsim._core.config import CONFIG
 from smartsim._core.config.config import Config
 from smartsim._core.utils.telemetry.telemetry import JobEntity
 from smartsim.database import Orchestrator
 from smartsim.entity import Model
 from smartsim.error import SSConfigError
+from smartsim.log import get_logger
 from smartsim.settings import (
     AprunSettings,
+    DragonRunSettings,
     JsrunSettings,
     MpiexecSettings,
     MpirunSettings,
@@ -59,6 +64,8 @@
     SrunSettings,
 )
 
+logger = get_logger(__name__)
+
 # pylint: disable=redefined-outer-name,invalid-name,global-statement
 
 # Globals, yes, but its a testing file
@@ -72,6 +79,9 @@
 test_port = CONFIG.test_port
 test_account = CONFIG.test_account or ""
 test_batch_resources: t.Dict[t.Any, t.Any] = CONFIG.test_batch_resources
+test_output_dirs = 0
+mpi_app_exe = None
+built_mpi_app = False
 
 # Fill this at runtime if needed
 test_hostlist = None
@@ -107,7 +117,7 @@ def print_test_configuration() -> None:
 
 def pytest_configure() -> None:
     pytest.test_launcher = test_launcher
-    pytest.wlm_options = ["slurm", "pbs", "lsf", "pals"]
+    pytest.wlm_options = ["slurm", "pbs", "lsf", "pals", "dragon"]
     account = get_account()
     pytest.test_account = account
     pytest.test_device = test_device
@@ -124,6 +134,14 @@ def pytest_sessionstart(
     if os.path.isdir(test_output_root):
         shutil.rmtree(test_output_root)
     os.makedirs(test_output_root)
+    while not os.path.isdir(test_output_root):
+        time.sleep(0.1)
+
+    if CONFIG.dragon_server_path is None:
+        dragon_server_path =  os.path.join(test_output_root, "dragon_server")
+        os.makedirs(dragon_server_path)
+        os.environ["SMARTSIM_DRAGON_SERVER_PATH"] = dragon_server_path
+
     print_test_configuration()
 
 
@@ -135,10 +153,58 @@ def pytest_sessionfinish(
     returning the exit status to the system.
     """
     if exitstatus == 0:
-        shutil.rmtree(test_output_root)
+        cleanup_attempts = 5
+        while cleanup_attempts > 0:
+            try:
+                shutil.rmtree(test_output_root)
+            except OSError as e:
+                cleanup_attempts -= 1
+                time.sleep(1)
+                if not cleanup_attempts:
+                    raise
+            else:
+                break
+
+    # kill all spawned processes
+    kill_all_test_spawned_processes()
+
+
+def build_mpi_app() -> t.Optional[pathlib.Path]:
+    """Compile the MPI test application used by the test suite.
+
+    The source file ``mpi_hello.c`` is looked up in the directory returned by
+    ``FileUtils().get_test_conf_path("mpi")`` and compiled with ``cc`` (or
+    ``gcc`` if ``cc`` is not on the path) into ``apps/mpi_app`` under
+    ``test_output_root``.
+
+    The module-level flag ``built_mpi_app`` is set as soon as this function
+    is entered, i.e. it records that a build was *attempted*, not that it
+    succeeded.
+
+    :return: path to the compiled executable, or ``None`` if no compiler
+        was found, the compiler timed out, or it exited with a non-zero code
+    """
+    global built_mpi_app
+    built_mpi_app = True
+    cc = shutil.which("cc") or shutil.which("gcc")
+    if cc is None:
+        return None
+
+    path_to_src = pathlib.Path(FileUtils().get_test_conf_path("mpi"))
+    path_to_out = pathlib.Path(test_output_root) / "apps" / "mpi_app"
+    os.makedirs(path_to_out.parent, exist_ok=True)
+    cmd = [cc, str(path_to_src / "mpi_hello.c"), "-o", str(path_to_out)]
+    proc = subprocess.Popen(cmd)
+    try:
+        proc.wait(timeout=1)
+    except subprocess.TimeoutExpired:
+        # A slow compiler must not crash the caller or leave a stray child
+        # process behind: kill it, reap it, and report "not built".
+        proc.kill()
+        proc.wait()
+        return None
+    if proc.returncode == 0:
+        return path_to_out
     else:
-        # kill all spawned processes in case of error
-        kill_all_test_spawned_processes()
+        return None
+
+@pytest.fixture(scope="session")
+def mpi_app_path() -> t.Optional[pathlib.Path]:
+    """Return the path to the MPI test app, building it on first use.
+
+    :return: path to the executable, or ``None`` if ``CONFIG.test_mpi`` is
+        falsy or the app could not be built
+    """
+    # Required: without this declaration the assignment at the end of the
+    # function makes ``mpi_app_exe`` a local name, so the module-level cache
+    # is never updated and the early return below raises UnboundLocalError.
+    global mpi_app_exe
+
+    if not CONFIG.test_mpi:
+        return None
+
+    # if we already tried to build, return what we have
+    if built_mpi_app:
+        return mpi_app_exe
+
+    # attempt to build, set global
+    mpi_app_exe = build_mpi_app()
+    return mpi_app_exe
 
 
 def kill_all_test_spawned_processes() -> None:
@@ -156,6 +222,7 @@ def kill_all_test_spawned_processes() -> None:
         print("Not all processes were killed after test")
 
 
+
 def get_hostlist() -> t.Optional[t.List[str]]:
     global test_hostlist
     if not test_hostlist:
@@ -252,6 +319,11 @@ def get_base_run_settings(
             run_args.update(kwargs)
             settings = RunSettings(exe, args, run_command="srun", run_args=run_args)
             return settings
+        if test_launcher == "dragon":
+            run_args = {"nodes": nodes}
+            run_args.update(kwargs)
+            settings = RunSettings(exe, args, run_command="", run_args=run_args)
+            return settings
         if test_launcher == "pbs":
             if shutil.which("aprun"):
                 run_command = "aprun"
@@ -293,6 +365,11 @@ def get_run_settings(
             run_args = {"nodes": nodes, "ntasks": ntasks, "time": "00:10:00"}
             run_args.update(kwargs)
             return SrunSettings(exe, args, run_args=run_args)
+        if test_launcher == "dragon":
+            run_args = {"nodes": nodes}
+            run_args.update(kwargs)
+            settings = DragonRunSettings(exe, args, run_args=run_args)
+            return settings
         if test_launcher == "pbs":
             if shutil.which("aprun"):
                 run_args = {"pes": ntasks}
@@ -351,6 +428,14 @@ def get_orchestrator(nodes: int = 1, batch: bool = False) -> Orchestrator:
                 interface=test_nic,
                 launcher=test_launcher,
             )
+        if test_launcher == "dragon":
+            return Orchestrator(
+                db_nodes=nodes,
+                port=test_port,
+                batch=batch,
+                interface=test_nic,
+                launcher=test_launcher,
+            )
         if test_launcher == "lsf":
             return Orchestrator(
                 db_nodes=nodes,
@@ -443,6 +528,14 @@ def environment_cleanup(monkeypatch: pytest.MonkeyPatch) -> None:
     monkeypatch.delenv("SSKEYOUT", raising=False)
 
 
+@pytest.fixture(scope="function", autouse=True)
+def check_output_dir() -> None:
+    """Check the shared test output directory during setup of every test.
+
+    Asserts that ``test_output_root`` is an existing directory and that it
+    contains at least as many entries as were counted on the previous
+    invocation, then stores the current entry count in the module-level
+    ``test_output_dirs``.
+    """
+    global test_output_dirs
+    assert os.path.isdir(test_output_root)
+    # Take a single snapshot so the value that is asserted on is exactly the
+    # value that is stored for the next comparison.
+    entry_count = len(os.listdir(test_output_root))
+    assert entry_count >= test_output_dirs
+    test_output_dirs = entry_count
+
+
 @pytest.fixture
 def dbutils() -> t.Type[DBUtils]:
     return DBUtils
@@ -678,6 +771,28 @@ def setup_test_colo(
         return colo_model
 
 
+@pytest.fixture(scope="function")
+def global_dragon_teardown() -> None:
+    """Force the shutdown of a running dragon server.
+
+    Does nothing unless the test launcher is ``"dragon"`` and
+    ``CONFIG.dragon_server_path`` is set. Otherwise a throw-away experiment
+    is created under ``test_output_root``, a short ``sleep`` model is run to
+    completion through it, and ``cleanup()`` is then called on the
+    experiment's launcher, followed by a five second pause.
+    """
+    if not (test_launcher == "dragon" and CONFIG.dragon_server_path is not None):
+        return
+
+    teardown_dir = os.path.join(test_output_root, "dragon_teardown")
+    os.makedirs(teardown_dir, exist_ok=True)
+    teardown_exp: Experiment = Experiment(
+        "dragon_shutdown",
+        exp_path=teardown_dir,
+        launcher=test_launcher,
+    )
+
+    # Run a trivial blocking job through the experiment before reaching
+    # into its launcher.
+    sleep_settings = teardown_exp.create_run_settings("sleep", ["0.1"])
+    dummy_model = teardown_exp.create_model("dummy", run_settings=sleep_settings)
+    teardown_exp.generate(dummy_model, overwrite=True)
+    teardown_exp.start(dummy_model, block=True)
+
+    # The launcher is only reachable through private attributes.
+    dragon_launcher: DragonLauncher = teardown_exp._control._launcher
+    dragon_launcher.cleanup()
+    # NOTE(review): presumably gives the runtime time to go down - confirm
+    time.sleep(5)
+
+
 @pytest.fixture
 def config() -> Config:
     return CONFIG

diff --git a/pyproject.toml b/pyproject.toml
@@ -78,7 +78,7 @@ namespace_packages = true
 files = [
   "smartsim"
 ]
-plugins = []
+plugins = ["pydantic.mypy"]
 ignore_errors = false
 
 # Dynamic typing
@@ -124,6 +124,7 @@ module = [
   "torch",
   "smartsim.ml.torch.*",            # must solve/ignore inheritance issues
   "watchdog",
+  "dragon.*",
 ]
 ignore_missing_imports = true
 ignore_errors = true
diff --git a/setup.py b/setup.py
@@ -169,6 +169,8 @@ def has_ext_modules(_placeholder):
     "filelock>=3.4.2",
     "protobuf~=3.20",
     "watchdog>=3.0.0,<4.0.0",
+    "pydantic==1.10.14",
+    "pyzmq>=25.1.2",
 ]
 
 # Add SmartRedis at specific version

diff --git a/smartsim/_core/_cli/validate.py b/smartsim/_core/_cli/validate.py
@@ -30,7 +30,6 @@
 import multiprocessing as mp
 import os
 import os.path
-import socket
 import tempfile
 import typing as t
 from types import TracebackType
@@ -42,6 +41,7 @@
 from smartsim._core._cli.utils import SMART_LOGGER_FORMAT
 from smartsim._core._install.builder import Device
 from smartsim._core.utils.helpers import installed_redisai_backends
+from smartsim._core.utils.network import find_free_port
 from smartsim.log import get_logger
 
 logger = get_logger("Smart", fmt=SMART_LOGGER_FORMAT)
@@ -152,8 +152,8 @@ def test_install(
 ) -> None:
     exp = Experiment("ValidationExperiment", exp_path=location, launcher="local")
     exp.telemetry.disable()
+    port = find_free_port() if port is None else port
 
-    port = _find_free_port() if port is None else port
     with _make_managed_local_orc(exp, port) as client:
         logger.info("Verifying Tensor Transfer")
         client.put_tensor("plain-tensor", np.ones((1, 1, 3, 3)))
@@ -206,14 +206,6 @@ def _make_managed_local_orc(
         exp.stop(orc)
 
 
-def _find_free_port() -> int:
-    """A 'good enough' way to find an open port to bind to"""
-    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
-        sock.bind(("0.0.0.0", 0))
-        _, port = sock.getsockname()
-        return int(port)
-
-
 def _test_tf_install(client: Client, tmp_dir: str, device: Device) -> None:
     recv_conn, send_conn = mp.Pipe(duplex=False)
     # Build the model in a subproc so that keras does not hog the gpu