diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 09b680374..fc817cba9 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -44,23 +44,17 @@ env: jobs: run_tests: - name: Run tests with ${{ matrix.os }}, Python ${{ matrix.py_v}}, RedisAI ${{ matrix.rai }} + name: Run tests ${{ matrix.subset }} with ${{ matrix.os }}, Python ${{ matrix.py_v}}, RedisAI ${{ matrix.rai }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: + subset: [backends, slow_tests, group_a, group_b] os: [macos-12, ubuntu-20.04] # Operating systems compiler: [8] # GNU compiler version - rai: [1.2.5, 1.2.7] # Redis AI versions + rai: [1.2.7] # Redis AI versions py_v: [3.8, 3.9, '3.10'] # Python versions - exclude: - # Do not build with Redis AI 1.2.5 on MacOS - - os: macos-12 - rai: 1.2.5 - # Do not build Redis AI 1.2.5 with py3.10 - # as wheels for dependecies are not availble - - py_v: '3.10' - rai: 1.2.5 + env: SMARTSIM_REDISAI: ${{ matrix.rai }} @@ -86,7 +80,7 @@ jobs: echo "$(brew --prefix)/opt/make/libexec/gnubin" >> $GITHUB_PATH - name: Build Singularity from source - if: contains( matrix.os, 'ubuntu' ) && matrix.py_v == 3.9 && matrix.rai == '1.2.5' + if: contains( matrix.os, 'ubuntu' ) && matrix.py_v == 3.9 run: | sudo apt-get install -y libseccomp-dev pkg-config squashfs-tools cryptsetup curl git # wget build-essential echo 'export PATH=/usr/local/go/bin:$PATH' >> ~/.bashrc @@ -100,16 +94,18 @@ jobs: sudo make -C builddir install - name: singularity pull test container # This lets us time how long the pull takes - if: contains( matrix.os, 'ubuntu' ) && matrix.py_v == 3.9 && matrix.rai == '1.2.5' + if: contains( matrix.os, 'ubuntu' ) && matrix.py_v == 3.9 run: singularity pull docker://alrigazzi/smartsim-testing # Note: The develop branch of smartredis is installed first to ensure that any tests that depend # on developments of the client are brought in. 
- name: Install SmartSim (with ML backends) run: | + python -m pip install git+https://github.com/CrayLabs/SmartRedis.git@develop#egg=smartredis python -m pip install .[dev,ml] + - name: Install ML Runtimes with Smart (with pt, tf, and onnx support) if: (matrix.py_v != '3.10') run: smart build --device cpu --onnx -v @@ -119,8 +115,6 @@ jobs: run: smart build --device cpu -v - name: Run mypy - # TF 2.6.2 has a dep conflict with new mypy versions - if: (matrix.rai != '1.2.5') run: | python -m pip install .[mypy] make check-mypy @@ -128,13 +122,36 @@ jobs: - name: Run Pylint run: make check-lint + # Run pytest (backends subdirectory) - name: Run Pytest + if: (matrix.subset == 'backends') run: | echo "SMARTSIM_LOG_LEVEL=debug" >> $GITHUB_ENV - py.test -s --import-mode=importlib -o log_cli=true --cov=$(smart site) --cov-report=xml --cov-config=./tests/test_configs/cov/local_cov.cfg --ignore=tests/full_wlm/ ./tests/ + py.test -s --import-mode=importlib -o log_cli=true --cov=$(smart site) --cov-report=xml --cov-config=./tests/test_configs/cov/local_cov.cfg --ignore=tests/full_wlm/ ./tests/backends + + # Run pytest (test subsets) + - name: Run Pytest + if: "!contains(matrix.subset, 'backends')" # if not running backend tests + run: | + echo "SMARTSIM_LOG_LEVEL=debug" >> $GITHUB_ENV + py.test -s --import-mode=importlib -o log_cli=true --cov=$(smart site) --cov-report=xml --cov-config=./tests/test_configs/cov/local_cov.cfg --ignore=tests/full_wlm/ -m ${{ matrix.subset }} ./tests + + # Upload artifacts on failure, ignoring binary files + - name: Upload Artifact + if: failure() + uses: actions/upload-artifact@v3 + with: + name: test_artifact + path: | + tests/test_output + !**/*.so + !**/*.pb + !**/*.pt + !**/core + retention-days: 5 - name: Upload Pytest coverage to Codecov uses: codecov/codecov-action@v2 with: - fail_ci_if_error: true + fail_ci_if_error: false files: ./coverage.xml diff --git a/.gitignore b/.gitignore index d67b476fe..428e439b3 100644 --- a/.gitignore +++ 
b/.gitignore @@ -7,7 +7,7 @@ __pycache__ htmlcov smartsim.egg-info tests/test_output -docs/* +.DS_Store # Dependencies smartsim/_core/.third-party @@ -31,3 +31,8 @@ smartsim/_core/bin/*-cli # created upon install smartsim/_core/lib + +**/manifest/ +**/*.err +**/*.out +**/.smartsim/* diff --git a/.pylintrc b/.pylintrc index 9ac79811c..da0886ba2 100644 --- a/.pylintrc +++ b/.pylintrc @@ -325,7 +325,7 @@ valid-metaclass-classmethod-first-arg=mcs max-args=9 # Maximum number of locals for function / method body -max-locals=19 +max-locals=20 # Maximum number of return / yield for function / method body max-returns=11 diff --git a/.wci.yml b/.wci.yml index 1c7ef5d72..55b5ddda1 100644 --- a/.wci.yml +++ b/.wci.yml @@ -22,8 +22,8 @@ language: Python release: - version: 0.5.1 - date: 2023-09-13 + version: 0.6.0 + date: 2023-12-18 documentation: general: https://www.craylabs.org/docs/overview.html diff --git a/MANIFEST.in b/MANIFEST.in index 885a8dbb4..20ba2dbe5 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,4 @@ include Makefile graft smartsim/ - +global-exclude __pycache__ +global-exclude *.py[co] diff --git a/Makefile b/Makefile index 7b60f8874..fef69eab3 100644 --- a/Makefile +++ b/Makefile @@ -146,11 +146,11 @@ tutorials-dev: @docker compose build tutorials-dev @docker run -p 8888:8888 smartsim-tutorials:dev-latest -# help: tutorials-prod - Build and start a docker container to run the tutorials (v0.4.2) +# help: tutorials-prod - Build and start a docker container to run the tutorials (v0.6.0) .PHONY: tutorials-prod tutorials-prod: @docker compose build tutorials-prod - @docker run -p 8888:8888 smartsim-tutorials:v0.4.2 + @docker run -p 8888:8888 smartsim-tutorials:v0.6.0 # help: diff --git a/README.md b/README.md index 9e1902784..df671ef02 100644 --- a/README.md +++ b/README.md @@ -6,10 +6,11 @@
Home    - Install    + Install    Documentation    - Slack Invite    Cray Labs    + Contact    + Join us on Slack!   


@@ -649,18 +650,6 @@ from C, C++, Fortran and Python with the SmartRedis Clients: ONNX 1.11.x - 1.2.5 - PyTorch - 1.9.x - - - TensorFlow\Keras - 2.6.x - - - ONNX - 1.9.x - diff --git a/conftest.py b/conftest.py index f20f48766..ff4e56ee1 100644 --- a/conftest.py +++ b/conftest.py @@ -28,7 +28,6 @@ import json import os -import inspect import pytest import psutil import shutil @@ -41,31 +40,37 @@ AprunSettings, JsrunSettings, MpirunSettings, + MpiexecSettings, + PalsMpiexecSettings, RunSettings, ) from smartsim._core.config import CONFIG from smartsim.error import SSConfigError from subprocess import run import sys +import tempfile import typing as t +import uuid +import warnings # pylint: disable=redefined-outer-name,invalid-name,global-statement # Globals, yes, but its a testing file test_path = os.path.dirname(os.path.abspath(__file__)) -test_dir = os.path.join(test_path, "tests", "test_output") +test_output_root = os.path.join(test_path, "tests", "test_output") test_launcher = CONFIG.test_launcher -test_device = CONFIG.test_device +test_device = CONFIG.test_device.upper() test_num_gpus = CONFIG.test_num_gpus test_nic = CONFIG.test_interface test_alloc_specs_path = os.getenv("SMARTSIM_TEST_ALLOC_SPEC_SHEET_PATH", None) test_port = CONFIG.test_port test_account = CONFIG.test_account or "" +test_batch_resources: t.Dict[t.Any,t.Any] = CONFIG.test_batch_resources # Fill this at runtime if needed test_hostlist = None - +has_aprun = shutil.which("aprun") is not None def get_account() -> str: return test_account @@ -77,14 +82,21 @@ def print_test_configuration() -> None: print("TEST_LAUNCHER:", test_launcher) if test_account != "": print("TEST_ACCOUNT:", test_account) - print("TEST_DEVICE:", test_device) + test_device_msg = f"TEST_DEVICE: {test_device}" + if test_device == "GPU": + test_device_msg += f"x{test_num_gpus}" + print(test_device_msg) print("TEST_NETWORK_INTERFACE (WLM only):", test_nic) if test_alloc_specs_path: print("TEST_ALLOC_SPEC_SHEET_PATH:", 
test_alloc_specs_path) - print("TEST_DIR:", test_dir) + print("TEST_DIR:", test_output_root) print("Test output will be located in TEST_DIR if there is a failure") - print("TEST_PORT", test_port) - print("TEST_PORT + 1", test_port + 1) + print( + "TEST_PORTS:", ", ".join(str(port) for port in range(test_port, test_port + 3)) + ) + if test_batch_resources: + print("TEST_BATCH_RESOURCES: ") + print(json.dumps(test_batch_resources, indent=2)) def pytest_configure() -> None: @@ -92,6 +104,8 @@ def pytest_configure() -> None: pytest.wlm_options = ["slurm", "pbs", "cobalt", "lsf", "pals"] account = get_account() pytest.test_account = account + pytest.test_device = test_device + pytest.has_aprun = has_aprun def pytest_sessionstart( @@ -101,9 +115,9 @@ def pytest_sessionstart( Called after the Session object has been created and before performing collection and entering the run test loop. """ - if os.path.isdir(test_dir): - shutil.rmtree(test_dir) - os.makedirs(test_dir) + if os.path.isdir(test_output_root): + shutil.rmtree(test_output_root) + os.makedirs(test_output_root) print_test_configuration() @@ -115,7 +129,7 @@ def pytest_sessionfinish( returning the exit status to the system. 
""" if exitstatus == 0: - shutil.rmtree(test_dir) + shutil.rmtree(test_output_root) else: # kill all spawned processes in case of error kill_all_test_spawned_processes() @@ -144,6 +158,12 @@ def get_hostlist() -> t.Optional[t.List[str]]: return _parse_hostlist_file(os.environ["COBALT_NODEFILE"]) except FileNotFoundError: return None + elif "PBS_NODEFILE" in os.environ and test_launcher == "pals": + # with PALS, we need a hostfile even if `aprun` is available + try: + return _parse_hostlist_file(os.environ["PBS_NODEFILE"]) + except FileNotFoundError: + return None elif "PBS_NODEFILE" in os.environ and not shutil.which("aprun"): try: return _parse_hostlist_file(os.environ["PBS_NODEFILE"]) @@ -216,6 +236,10 @@ def get_test_interface() -> t.List[str]: def get_test_hostlist() -> t.Optional[t.List[str]]: return get_hostlist() + @staticmethod + def get_batch_resources() -> t.Dict: + return test_batch_resources + @staticmethod def get_base_run_settings( exe: str, args: t.List[str], nodes: int = 1, ntasks: int = 1, **kwargs: t.Any @@ -320,7 +344,7 @@ def get_run_settings( @staticmethod def get_orchestrator(nodes: int = 1, batch: bool = False) -> Orchestrator: - if test_launcher in ["pbs", "cobalt", "pals"]: + if test_launcher in ["pbs", "cobalt"]: if not shutil.which("aprun"): hostlist = get_hostlist() else: @@ -333,6 +357,16 @@ def get_orchestrator(nodes: int = 1, batch: bool = False) -> Orchestrator: launcher=test_launcher, hosts=hostlist, ) + if test_launcher == "pals": + hostlist = get_hostlist() + return Orchestrator( + db_nodes=nodes, + port=test_port, + batch=batch, + interface=test_nic, + launcher=test_launcher, + hosts=hostlist, + ) if test_launcher == "slurm": return Orchestrator( db_nodes=nodes, @@ -355,18 +389,23 @@ def get_orchestrator(nodes: int = 1, batch: bool = False) -> Orchestrator: return Orchestrator(port=test_port, interface="lo") + @staticmethod + def choose_host(rs: RunSettings) -> t.Optional[str]: + if isinstance(rs, (MpirunSettings, 
MpiexecSettings)): + hl = get_hostlist() + if hl is not None: + return hl[0] + + return None @pytest.fixture def local_db( - fileutils: FileUtils, request: t.Any, wlmutils: t.Type[WLMUtils] + request: t.Any, wlmutils: t.Type[WLMUtils], test_dir: str ) -> t.Generator[Orchestrator, None, None]: """Yield fixture for startup and teardown of an local orchestrator""" exp_name = request.function.__name__ - exp = Experiment(exp_name, launcher="local") - test_dir = fileutils.make_test_dir( - caller_function=exp_name, caller_fspath=request.fspath - ) + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) db = Orchestrator(port=wlmutils.get_test_port(), interface="lo") db.set_path(test_dir) exp.start(db) @@ -379,16 +418,13 @@ def local_db( @pytest.fixture def db( - fileutils: t.Type[FileUtils], wlmutils: t.Type[WLMUtils], request: t.Any + request: t.Any, wlmutils: t.Type[WLMUtils], test_dir: str ) -> t.Generator[Orchestrator, None, None]: """Yield fixture for startup and teardown of an orchestrator""" launcher = wlmutils.get_test_launcher() exp_name = request.function.__name__ - exp = Experiment(exp_name, launcher=launcher) - test_dir = fileutils.make_test_dir( - caller_function=exp_name, caller_fspath=request.fspath - ) + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) db = wlmutils.get_orchestrator() db.set_path(test_dir) exp.start(db) @@ -401,7 +437,7 @@ def db( @pytest.fixture def db_cluster( - fileutils: t.Type[FileUtils], wlmutils: t.Type[WLMUtils], request: t.Any + test_dir: str, wlmutils: t.Type[WLMUtils], request: t.Any ) -> t.Generator[Orchestrator, None, None]: """ Yield fixture for startup and teardown of a clustered orchestrator. 
@@ -410,10 +446,7 @@ def db_cluster( launcher = wlmutils.get_test_launcher() exp_name = request.function.__name__ - exp = Experiment(exp_name, launcher=launcher) - test_dir = fileutils.make_test_dir( - caller_function=exp_name, caller_fspath=request.fspath - ) + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) db = wlmutils.get_orchestrator(nodes=3) db.set_path(test_dir) exp.start(db) @@ -426,7 +459,9 @@ def db_cluster( @pytest.fixture(scope="function", autouse=True) def environment_cleanup(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.delenv("SSDB", raising=False) + for key in os.environ.keys(): + if key.startswith("SSDB"): + monkeypatch.delenv(key, raising=False) monkeypatch.delenv("SSKEYIN", raising=False) monkeypatch.delenv("SSKEYOUT", raising=False) @@ -513,6 +548,38 @@ def get_config_edit_method( return config_edit_methods.get(config_setting, None) +def _sanitize_caller_function(caller_function: str) -> str: + # Parametrized test functions end with a list of all + # parameter values. The list is enclosed in square brackets. + # We split at the opening bracket, sanitize the string + # to its right and then merge the function name and + # the sanitized list with a dot. 
+ caller_function = caller_function.replace("]","") + caller_function_list = caller_function.split("[", maxsplit=1) + + def is_accepted_char(char: str) -> bool: + return char.isalnum() or char in "-._" + + if len(caller_function_list) > 1: + caller_function_list[1] = "".join( + filter(is_accepted_char, caller_function_list[1]) + ) + + return ".".join(caller_function_list) + + +@pytest.fixture +def test_dir(request: pytest.FixtureRequest) -> str: + caller_function = _sanitize_caller_function(request.node.name) + dir_path = FileUtils.get_test_output_path(caller_function, str(request.path)) + + try: + os.makedirs(dir_path) + except Exception: + return dir_path + return dir_path + + @pytest.fixture def fileutils() -> t.Type[FileUtils]: return FileUtils @@ -520,86 +587,10 @@ def fileutils() -> t.Type[FileUtils]: class FileUtils: @staticmethod - def _test_dir_path(caller_function: str, caller_fspath: str) -> str: + def get_test_output_path(caller_function: str, caller_fspath: str) -> str: caller_file_to_dir = os.path.splitext(str(caller_fspath))[0] - rel_path = os.path.relpath(caller_file_to_dir, os.path.dirname(test_dir)) - dir_path = os.path.join(test_dir, rel_path, caller_function) - return dir_path - - @staticmethod - def get_test_dir( - caller_function: t.Optional[str] = None, - caller_fspath: t.Optional[str] = None, - level: int = 1, - ) -> str: - """Get path to test output. - - This function should be called without arguments from within - a test: the returned directory will be - `test_output///`. - When called from other functions (e.g. from functions in this file), - the caller function and the caller file path should be provided. - The directory will not be created, but the parent (and all the needed - tree) will. This is to allow tests to create the directory. 
- - :param caller_function: caller function name defaults to None - :type caller_function: str, optional - :param caller_fspath: absolute path to file containing caller, defaults to None - :type caller_fspath: str or Path, optional - :return: String path to test output directory - :rtype: str - """ - if not caller_function or not caller_fspath: - caller_frame = inspect.stack()[level] - caller_fspath = caller_frame.filename - caller_function = caller_frame.function - - dir_path = FileUtils._test_dir_path(caller_function, caller_fspath) - if not os.path.exists(os.path.dirname(dir_path)): - os.makedirs(os.path.dirname(dir_path)) - # dir_path = os.path.join(test_dir, dir_name) - return dir_path - - @staticmethod - def make_test_dir( - caller_function: t.Optional[str] = None, - caller_fspath: t.Optional[str] = None, - level: int = 1, - sub_dir: t.Optional[str] = None, - ) -> str: - """Create test output directory and return path to it. - - This function should be called without arguments from within - a test: the directory will be created as - `test_output///`. - When called from other functions (e.g. from functions in this file), - the caller function and the caller file path should be provided. - - :param caller_function: caller function name defaults to None - :type caller_function: str, optional - :param caller_fspath: absolute path to file containing caller, defaults to None - :type caller_fspath: str or Path, optional - :param level: indicate depth in the call stack relative to test method. 
- :type level: int, optional - :param sub_dir: a relative path to create in the test directory - :type sub_dir: str or Path, optional - - :return: String path to test output directory - :rtype: str - """ - if not caller_function or not caller_fspath: - caller_frame = inspect.stack()[level] - caller_fspath = caller_frame.filename - caller_function = caller_frame.function - - dir_path = FileUtils._test_dir_path(caller_function, caller_fspath) - if sub_dir: - dir_path = os.path.join(dir_path, sub_dir) - - try: - os.makedirs(dir_path) - except Exception: - return dir_path + rel_path = os.path.relpath(caller_file_to_dir, os.path.dirname(test_output_root)) + dir_path = os.path.join(test_output_root, rel_path, caller_function) return dir_path @staticmethod @@ -613,21 +604,23 @@ def get_test_dir_path(dirname: str) -> str: return dir_path @staticmethod - def make_test_file(file_name: str, file_dir: t.Optional[str] = None) -> str: + def make_test_file(file_name: str, file_dir: str, file_content: t.Optional[str] = None) -> str: """Create a dummy file in the test output directory. :param file_name: name of file to create, e.g. "file.txt" :type file_name: str - :param file_dir: path relative to test output directory, e.g. 
"deps/libs" + :param file_dir: path :type file_dir: str :return: String path to test output file :rtype: str """ - test_dir = FileUtils.make_test_dir(level=2, sub_dir=file_dir) - file_path = os.path.join(test_dir, file_name) - + file_path = os.path.join(file_dir, file_name) + os.makedirs(file_dir) with open(file_path, "w+", encoding="utf-8") as dummy_file: - dummy_file.write("dummy\n") + if not file_content: + dummy_file.write("dummy\n") + else: + dummy_file.write(file_content) return file_path @@ -658,32 +651,49 @@ def setup_test_colo( fileutils: t.Type[FileUtils], db_type: str, exp: Experiment, + application_file: str, db_args: t.Dict[str, t.Any], - colo_settings: t.Optional[t.Dict[str, t.Any]] = None, + colo_settings: t.Optional[RunSettings] = None, + colo_model_name: str = "colocated_model", + port: int = test_port, + on_wlm: bool = False, ) -> Model: - """Setup things needed for setting up the colo pinning tests""" + """Setup database needed for the colo pinning tests""" + # get test setup - test_dir = fileutils.make_test_dir(level=2) - sr_test_script = fileutils.get_test_conf_path("send_data_local_smartredis.py") + sr_test_script = fileutils.get_test_conf_path(application_file) # Create an app with a colo_db which uses 1 db_cpu if colo_settings is None: colo_settings = exp.create_run_settings( exe=sys.executable, exe_args=[sr_test_script] ) - colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.set_path(test_dir) + if on_wlm: + colo_settings.set_tasks(1) + colo_settings.set_nodes(1) + colo_model = exp.create_model(colo_model_name, colo_settings) if db_type in ["tcp", "deprecated"]: - db_args["port"] = 6780 + db_args["port"] = port db_args["ifname"] = "lo" + if db_type == "uds" and colo_model_name is not None: + tmp_dir = tempfile.gettempdir() + socket_suffix = str(uuid.uuid4())[:7] + db_args["unix_socket"] = os.path.join(tmp_dir, + f"{colo_model_name}_{socket_suffix}.socket") colocate_fun: t.Dict[str, t.Callable[..., None]] = { "tcp": 
colo_model.colocate_db_tcp, "deprecated": colo_model.colocate_db, "uds": colo_model.colocate_db_uds, } - colocate_fun[db_type](**db_args) + with warnings.catch_warnings(): + if db_type == "deprecated": + warnings.filterwarnings( + "ignore", + message="`colocate_db` has been deprecated" + ) + colocate_fun[db_type](**db_args) # assert model will launch with colocated db assert colo_model.colocated # Check to make sure that limit_db_cpus made it into the colo settings diff --git a/doc/_static/version_names.json b/doc/_static/version_names.json index b6d8984e2..8ae78ebdb 100644 --- a/doc/_static/version_names.json +++ b/doc/_static/version_names.json @@ -1,7 +1,9 @@ { "version_names":[ "develop (unstable)", - "0.5.0 (stable)", + "0.6.0 (stable)", + "0.5.1", + "0.5.0", "0.4.2", "0.4.1", "0.4.0", @@ -10,6 +12,8 @@ "version_urls": [ "https://www.craylabs.org/develop/overview.html", "https://www.craylabs.org/docs/overview.html", + "https://www.craylabs.org/docs/versions/0.5.1/overview.html", + "https://www.craylabs.org/docs/versions/0.5.0/overview.html", "https://www.craylabs.org/docs/versions/0.4.2/overview.html", "https://www.craylabs.org/docs/versions/0.4.1/overview.html", "https://www.craylabs.org/docs/versions/0.4.0/overview.html", diff --git a/doc/api/smartredis_api.rst b/doc/api/smartredis_api.rst index 3a2d77e22..27838bf7b 100644 --- a/doc/api/smartredis_api.rst +++ b/doc/api/smartredis_api.rst @@ -1,7 +1,10 @@ +.. _smartredis-api: + ************** SmartRedis API ************** + Python ****** diff --git a/doc/api/smartsim_api.rst b/doc/api/smartsim_api.rst index 315c37b4e..5136c8aa5 100644 --- a/doc/api/smartsim_api.rst +++ b/doc/api/smartsim_api.rst @@ -4,10 +4,11 @@ SmartSim API ************* +.. _experiment_api: + Experiment ========== -.. _experiment_api: .. currentmodule:: smartsim.experiment @@ -73,10 +74,11 @@ Types of Containers: Singularity +.. _rs-api: + RunSettings ----------- -.. 
_rs-api: When running SmartSim on laptops and single node workstations, the base ``RunSettings`` object is used to parameterize jobs. @@ -96,10 +98,11 @@ launches that utilize a parallel launch binary like :members: +.. _srun_api: + SrunSettings ------------ -.. _srun_api: ``SrunSettings`` can be used for running on existing allocations, running jobs in interactive allocations, and for adding srun @@ -126,11 +129,11 @@ steps to a batch. :members: +.. _aprun_api: AprunSettings ------------- -.. _aprun_api: ``AprunSettings`` can be used on any system that supports the Cray ALPS layer. SmartSim supports using ``AprunSettings`` @@ -159,11 +162,11 @@ and within batch launches (e.g., ``QsubBatchSettings``) +.. _jsrun_api: JsrunSettings ------------- -.. _jsrun_api: ``JsrunSettings`` can be used on any system that supports the IBM LSF launcher. @@ -194,11 +197,11 @@ and within batch launches (i.e. ``BsubBatchSettings``) :undoc-members: :members: +.. _openmpi_run_api: MpirunSettings -------------- -.. _openmpi_run_api: ``MpirunSettings`` are for launching with OpenMPI. ``MpirunSettings`` are supported on Slurm, PBSpro, and Cobalt. @@ -221,11 +224,11 @@ supported on Slurm, PBSpro, and Cobalt. :undoc-members: :members: +.. _openmpi_exec_api: MpiexecSettings --------------- -.. _openmpi_exec_api: ``MpiexecSettings`` are for launching with OpenMPI's ``mpiexec``. ``MpirunSettings`` are supported on Slurm, PBSpro, and Cobalt. @@ -248,11 +251,11 @@ supported on Slurm, PBSpro, and Cobalt. :undoc-members: :members: +.. _openmpi_orte_api: OrterunSettings --------------- -.. _openmpi_orte_api: ``OrterunSettings`` are for launching with OpenMPI's ``orterun``. ``OrterunSettings`` are supported on Slurm, PBSpro, and Cobalt. @@ -279,11 +282,11 @@ supported on Slurm, PBSpro, and Cobalt. ------------------------------------------ +.. _sbatch_api: SbatchSettings -------------- -.. _sbatch_api: ``SbatchSettings`` are used for launching batches onto Slurm WLM systems. 
@@ -305,11 +308,11 @@ WLM systems. :undoc-members: :members: +.. _qsub_api: QsubBatchSettings ----------------- -.. _qsub_api: ``QsubBatchSettings`` are used to configure jobs that should be launched as a batch on PBSPro systems. @@ -333,12 +336,12 @@ be launched as a batch on PBSPro systems. :members: +.. _cqsub_api: + CobaltBatchSettings ------------------- -.. _cqsub_api: - ``CobaltBatchSettings`` are used to configure jobs that should be launched as a batch on Cobalt Systems. They closely mimic that of the ``QsubBatchSettings`` for PBSPro. @@ -359,11 +362,11 @@ that of the ``QsubBatchSettings`` for PBSPro. :members: +.. _bsub_api: BsubBatchSettings ----------------- -.. _bsub_api: ``BsubBatchSettings`` are used to configure jobs that should be launched as a batch on LSF systems. @@ -386,11 +389,11 @@ be launched as a batch on LSF systems. :undoc-members: :members: +.. _singularity_api: Singularity ----------- -.. _singularity_api: ``Singularity`` is a type of ``Container`` that can be passed to a ``RunSettings`` class or child class to enable running the workload in a @@ -407,11 +410,11 @@ Orchestrator .. currentmodule:: smartsim.database +.. _orc_api: Orchestrator ------------ -.. _orc_api: .. autoclass:: Orchestrator :members: @@ -446,11 +449,11 @@ Model :show-inheritance: :inherited-members: +.. _ensemble_api: Ensemble ======== -.. _ensemble_api: .. currentmodule:: smartsim.entity.ensemble @@ -472,31 +475,27 @@ Ensemble :show-inheritance: :inherited-members: +.. _ml_api: Machine Learning ================ -.. _ml_api: SmartSim includes built-in utilities for supporting TensorFlow, Keras, and Pytorch. +.. _smartsim_tf_api: + TensorFlow ---------- -.. _smartsim_tf_api: SmartSim includes built-in utilities for supporting TensorFlow and Keras in training and inference. .. currentmodule:: smartsim.ml.tf.utils -.. autosummary:: - - freeze_model - .. automodule:: smartsim.ml.tf.utils :members: - .. currentmodule:: smartsim.ml.tf .. 
autoclass:: StaticDataGenerator @@ -509,11 +508,11 @@ SmartSim includes built-in utilities for supporting TensorFlow and Keras in trai :show-inheritance: :inherited-members: +.. _smartsim_torch_api: + PyTorch ---------- -.. _smartsim_torch_api: - SmartSim includes built-in utilities for supporting PyTorch in training and inference. .. currentmodule:: smartsim.ml.torch @@ -533,11 +532,11 @@ SmartSim includes built-in utilities for supporting PyTorch in training and infe :show-inheritance: :inherited-members: +.. _slurm_module_api: + Slurm ===== -.. _slurm_module_api: - .. currentmodule:: smartsim.slurm @@ -548,4 +547,3 @@ Slurm .. automodule:: smartsim.slurm :members: - diff --git a/doc/changelog.rst b/doc/changelog.rst index 375e1f17e..befb9ee37 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -5,13 +5,88 @@ Changelog Listed here are the changes between each release of SmartSim and SmartRedis. -Jump to :ref:`SmartRedis Changelog ` +Jump to :ref:`SmartRedis Changelog ` SmartSim ======== +Development branch +------------------ + +To be released at some future point in time + + +0.6.0 +----- + +Released on 18 December, 2023 + +Description + +- Conflicting directives in the SmartSim packaging instructions were fixed +- `sacct` and `sstat` errors are now fatal for Slurm-based workflow executions +- Added documentation section about ML features and TorchScript +- Added TorchScript functions to Online Analysis tutorial +- Added multi-DB example to documentation +- Improved test stability on HPC systems +- Added support for producing & consuming telemetry outputs +- Split tests into groups for parallel execution in CI/CD pipeline +- Change signature of `Experiment.summary()` +- Expose first_device parameter for scripts, functions, models +- Added support for MINBATCHTIMEOUT in model execution +- Remove support for RedisAI 1.2.5, use RedisAI 1.2.7 commit +- Add support for multiple databases + +Detailed Notes + +- Several conflicting directives between the 
`setup.py` and the `setup.cfg` were fixed + to mitigate warnings issued when building the pip wheel. (SmartSim-PR435_) +- When the Slurm functions `sacct` and `sstat` returned an error, it would be ignored + and SmartSim's state could become inconsistent. To prevent this, errors + raised by `sacct` or `sstat` now result in an exception. (SmartSim-PR392_) +- A section named *ML Features* was added to documentation. It contains multiple + examples of how ML models and functions can be added to and executed on the DB. + TorchScript-based post-processing was added to the *Online Analysis* tutorial (SmartSim-PR411_) +- An example of how to use multiple Orchestrators concurrently was added to the documentation (SmartSim-PR409_) +- The test infrastructure was improved. Tests on HPC systems are now stable, and issues such + as non-stopped `Orchestrators` or experiments created in the wrong paths have been fixed (SmartSim-PR381_) +- A telemetry monitor was added to check updates and produce events for SmartDashboard (SmartSim-PR426_) +- Split tests into `group_a`, `group_b`, `slow_tests` for parallel execution in CI/CD pipeline (SmartSim-PR417_, SmartSim-PR424_) +- Change `format` argument to `style` in `Experiment.summary()`; this is + an API break (SmartSim-PR391_) +- Added support for first_device parameter for scripts, functions, + and models. This causes them to be loaded to the first num_devices + beginning with first_device (SmartSim-PR394_) +- Added support for MINBATCHTIMEOUT in model execution, which caps the delay + waiting for a minimum number of model execution operations to accumulate + before executing them as a batch (SmartSim-PR387_) +- RedisAI 1.2.5 is not supported anymore. The only supported RedisAI version + is now 1.2.7. Since the officially released RedisAI 1.2.7 has a + bug which breaks the build process on Mac OSX, it was decided to + use commit 634916c_ from RedisAI's GitHub repository, where such + bug has been fixed. This applies to all operating systems. 
(SmartSim-PR383_) +- Add support for creation of multiple databases with unique identifiers. (SmartSim-PR342_) + + +.. _SmartSim-PR435: https://github.com/CrayLabs/SmartSim/pull/435 +.. _SmartSim-PR392: https://github.com/CrayLabs/SmartSim/pull/392 +.. _SmartSim-PR411: https://github.com/CrayLabs/SmartSim/pull/411 +.. _SmartSim-PR409: https://github.com/CrayLabs/SmartSim/pull/409 +.. _SmartSim-PR381: https://github.com/CrayLabs/SmartSim/pull/381 +.. _SmartSim-PR426: https://github.com/CrayLabs/SmartSim/pull/426 +.. _SmartSim-PR424: https://github.com/CrayLabs/SmartSim/pull/424 +.. _SmartSim-PR417: https://github.com/CrayLabs/SmartSim/pull/417 +.. _SmartSim-PR391: https://github.com/CrayLabs/SmartSim/pull/391 +.. _SmartSim-PR342: https://github.com/CrayLabs/SmartSim/pull/342 +.. _SmartSim-PR394: https://github.com/CrayLabs/SmartSim/pull/394 +.. _SmartSim-PR387: https://github.com/CrayLabs/SmartSim/pull/387 +.. _SmartSim-PR383: https://github.com/CrayLabs/SmartSim/pull/383 +.. _634916c: https://github.com/RedisAI/RedisAI/commit/634916c722e718cc6ea3fad46e63f7d798f9adc2 +.. _SmartSim-PR342: https://github.com/CrayLabs/SmartSim/pull/342 + + 0.5.1 ----- @@ -36,53 +111,53 @@ Description Detailed Notes -- Add methods to allow users to inspect files attached to models and ensembles. (PR352_) -- Add a `smart info` target to provide rudimentary information about the SmartSim installation. (PR350_) -- Remove unnecessary generation producing unexpected directories in the test suite. (PR349_) -- Add support for heterogeneous jobs to `SrunSettings` by allowing users to set the `--het-group` parameter. (PR346_) -- Provide clearer guidelines on how to contribute to SmartSim. (PR344_) -- Integrate `PalsMpiexecSettings` into the `Experiment` factory methods when using the `"pals"` launcher. (PR343_) -- Create public properties where appropriate to mitigate `protected-access` errors. (PR341_) -- Fix a failure to execute `_prep_colocated_db` due to incorrect named attr check. 
(PR339_) -- Enabled and mitigated mypy `disallow_any_generics` and `warn_return_any`. (PR338_) -- Add a `smart validate` target to provide a simple smoke test to assess a SmartSim build. (PR336_, PR351_) -- Add typehints to `smartsim._core.launcher.step.*`. (PR334_) -- Log errors reported from slurm WLM when attempts to retrieve status fail. (PR331_, PR332_) -- Fix incorrectly formatted positional arguments in log format strings. (PR330_) -- Ensure that launchers pass environment variables to unmanaged job steps. (PR329_) -- Add additional tests surrounding the `RAI_PATH` configuration environment variable. (PR328_) -- Remove unnecessary execution of unescaped shell commands. (PR327_) -- Add error if user calls get_allocation with reserved keywords in slurm get_allocation. (PR325_) -- Add error when user requests CPU with devices greater than 1 within add_ml_model and add_script. (PR324_) -- Update documentation surrounding ensemble key prefixing. (PR322_) -- Fix formatting of the Frontier site installation. (PR321_) -- Update pylint dependency, update .pylintrc, mitigate non-breaking issues, suppress api breaks. (PR311_) -- Refactor the `smart` CLI to use subparsers for better documentation and extension. (PR308_) - -.. _PR352: https://github.com/CrayLabs/SmartSim/pull/352 -.. _PR351: https://github.com/CrayLabs/SmartSim/pull/351 -.. _PR350: https://github.com/CrayLabs/SmartSim/pull/350 -.. _PR349: https://github.com/CrayLabs/SmartSim/pull/349 -.. _PR346: https://github.com/CrayLabs/SmartSim/pull/346 -.. _PR344: https://github.com/CrayLabs/SmartSim/pull/344 -.. _PR343: https://github.com/CrayLabs/SmartSim/pull/343 -.. _PR341: https://github.com/CrayLabs/SmartSim/pull/341 -.. _PR339: https://github.com/CrayLabs/SmartSim/pull/339 -.. _PR338: https://github.com/CrayLabs/SmartSim/pull/338 -.. _PR336: https://github.com/CrayLabs/SmartSim/pull/336 -.. _PR334: https://github.com/CrayLabs/SmartSim/pull/334 -.. _PR332: https://github.com/CrayLabs/SmartSim/pull/332 -.. 
_PR331: https://github.com/CrayLabs/SmartSim/pull/331 -.. _PR330: https://github.com/CrayLabs/SmartSim/pull/330 -.. _PR329: https://github.com/CrayLabs/SmartSim/pull/329 -.. _PR328: https://github.com/CrayLabs/SmartSim/pull/328 -.. _PR327: https://github.com/CrayLabs/SmartSim/pull/327 -.. _PR325: https://github.com/CrayLabs/SmartSim/pull/325 -.. _PR324: https://github.com/CrayLabs/SmartSim/pull/324 -.. _PR322: https://github.com/CrayLabs/SmartSim/pull/322 -.. _PR321: https://github.com/CrayLabs/SmartSim/pull/321 -.. _PR311: https://github.com/CrayLabs/SmartSim/pull/311 -.. _PR308: https://github.com/CrayLabs/SmartSim/pull/308 +- Add methods to allow users to inspect files attached to models and ensembles. (SmartSim-PR352_) +- Add a `smart info` target to provide rudimentary information about the SmartSim installation. (SmartSim-PR350_) +- Remove unnecessary generation producing unexpected directories in the test suite. (SmartSim-PR349_) +- Add support for heterogeneous jobs to `SrunSettings` by allowing users to set the `--het-group` parameter. (SmartSim-PR346_) +- Provide clearer guidelines on how to contribute to SmartSim. (SmartSim-PR344_) +- Integrate `PalsMpiexecSettings` into the `Experiment` factory methods when using the `"pals"` launcher. (SmartSim-PR343_) +- Create public properties where appropriate to mitigate `protected-access` errors. (SmartSim-PR341_) +- Fix a failure to execute `_prep_colocated_db` due to incorrect named attr check. (SmartSim-PR339_) +- Enabled and mitigated mypy `disallow_any_generics` and `warn_return_any`. (SmartSim-PR338_) +- Add a `smart validate` target to provide a simple smoke test to assess a SmartSim build. (SmartSim-PR336_, SmartSim-PR351_) +- Add typehints to `smartsim._core.launcher.step.*`. (SmartSim-PR334_) +- Log errors reported from slurm WLM when attempts to retrieve status fail. (SmartSim-PR331_, SmartSim-PR332_) +- Fix incorrectly formatted positional arguments in log format strings. 
(SmartSim-PR330_) +- Ensure that launchers pass environment variables to unmanaged job steps. (SmartSim-PR329_) +- Add additional tests surrounding the `RAI_PATH` configuration environment variable. (SmartSim-PR328_) +- Remove unnecessary execution of unescaped shell commands. (SmartSim-PR327_) +- Add error if user calls get_allocation with reserved keywords in slurm get_allocation. (SmartSim-PR325_) +- Add error when user requests CPU with devices greater than 1 within add_ml_model and add_script. (SmartSim-PR324_) +- Update documentation surrounding ensemble key prefixing. (SmartSim-PR322_) +- Fix formatting of the Frontier site installation. (SmartSim-PR321_) +- Update pylint dependency, update .pylintrc, mitigate non-breaking issues, suppress api breaks. (SmartSim-PR311_) +- Refactor the `smart` CLI to use subparsers for better documentation and extension. (SmartSim-PR308_) + +.. _SmartSim-PR352: https://github.com/CrayLabs/SmartSim/pull/352 +.. _SmartSim-PR351: https://github.com/CrayLabs/SmartSim/pull/351 +.. _SmartSim-PR350: https://github.com/CrayLabs/SmartSim/pull/350 +.. _SmartSim-PR349: https://github.com/CrayLabs/SmartSim/pull/349 +.. _SmartSim-PR346: https://github.com/CrayLabs/SmartSim/pull/346 +.. _SmartSim-PR344: https://github.com/CrayLabs/SmartSim/pull/344 +.. _SmartSim-PR343: https://github.com/CrayLabs/SmartSim/pull/343 +.. _SmartSim-PR341: https://github.com/CrayLabs/SmartSim/pull/341 +.. _SmartSim-PR339: https://github.com/CrayLabs/SmartSim/pull/339 +.. _SmartSim-PR338: https://github.com/CrayLabs/SmartSim/pull/338 +.. _SmartSim-PR336: https://github.com/CrayLabs/SmartSim/pull/336 +.. _SmartSim-PR334: https://github.com/CrayLabs/SmartSim/pull/334 +.. _SmartSim-PR332: https://github.com/CrayLabs/SmartSim/pull/332 +.. _SmartSim-PR331: https://github.com/CrayLabs/SmartSim/pull/331 +.. _SmartSim-PR330: https://github.com/CrayLabs/SmartSim/pull/330 +.. _SmartSim-PR329: https://github.com/CrayLabs/SmartSim/pull/329 +.. 
_SmartSim-PR328: https://github.com/CrayLabs/SmartSim/pull/328 +.. _SmartSim-PR327: https://github.com/CrayLabs/SmartSim/pull/327 +.. _SmartSim-PR325: https://github.com/CrayLabs/SmartSim/pull/325 +.. _SmartSim-PR324: https://github.com/CrayLabs/SmartSim/pull/324 +.. _SmartSim-PR322: https://github.com/CrayLabs/SmartSim/pull/322 +.. _SmartSim-PR321: https://github.com/CrayLabs/SmartSim/pull/321 +.. _SmartSim-PR311: https://github.com/CrayLabs/SmartSim/pull/311 +.. _SmartSim-PR308: https://github.com/CrayLabs/SmartSim/pull/308 0.5.0 @@ -112,70 +187,70 @@ A full list of changes and detailed notes can be found below: Detailed notes -- Updates SmartRedis to the most current release (PR316_) -- Fixes and enhancements to documentation (PR317_, PR314_, PR287_) -- Various fixes and enhancements to the test suite (PR315_, PR312_, PR310_, PR302_, PR283_) +- Updates SmartRedis to the most current release (SmartSim-PR316_) +- Fixes and enhancements to documentation (SmartSim-PR317_, SmartSim-PR314_, SmartSim-PR287_) +- Various fixes and enhancements to the test suite (SmartSim-PR315_, SmartSim-PR312_, SmartSim-PR310_, SmartSim-PR302_, SmartSim-PR283_) - Fix a defect in the tests related to database models and scripts that was - causing key collisions when testing on workload managers (PR313_) -- Remove `requirements.txt` and other places where dependencies were defined. (PR307_) + causing key collisions when testing on workload managers (SmartSim-PR313_) +- Remove `requirements.txt` and other places where dependencies were defined. (SmartSim-PR307_) - Fix defect where dictionaries used to create run settings can be changed - unexpectedly due to copy-by-ref (PR305_) + unexpectedly due to copy-by-ref (SmartSim-PR305_) - The underlying code for Model.add_ml_model() and Model.add_script() was fixed to correctly handle multi-GPU configurations. Tests were updated to run on non-local launchers. Documentation was updated and fixed. 
Also, the default - testing interface has been changed to lo instead of ipogif. (PR304_) + testing interface has been changed to lo instead of ipogif. (SmartSim-PR304_) - Typehints have been added. A makefile target `make check-mypy` executes static - analysis with mypy. (PR295_, PR301_, PR303_) + analysis with mypy. (SmartSim-PR295_, SmartSim-PR301_, SmartSim-PR303_) - Replace `limit_app_cpus` with `limit_db_cpus` for co-located orchestrators. This resolves some incorrect behavior/assumptions about how the application would be pinned. Instead, users should directly specify the binding options in - their application using the options appropriate for their launcher (PR306_) -- Simplify code in `random_permutations` parameter generation strategy (PR300_) -- Remove wait time associated with Experiment launch summary (PR298_) -- Update Redis conf file to conform with Redis v7.0.5 conf file (PR293_) -- Migrate from redis-py-cluster to redis-py for cluster status checks (PR292_) -- Update full test suite to no longer require a tensorflow wheel to be available at test time. (PR291_) -- Correct spelling of colocated in doc strings (PR290_) + their application using the options appropriate for their launcher (SmartSim-PR306_) +- Simplify code in `random_permutations` parameter generation strategy (SmartSim-PR300_) +- Remove wait time associated with Experiment launch summary (SmartSim-PR298_) +- Update Redis conf file to conform with Redis v7.0.5 conf file (SmartSim-PR293_) +- Migrate from redis-py-cluster to redis-py for cluster status checks (SmartSim-PR292_) +- Update full test suite to no longer require a tensorflow wheel to be available at test time. (SmartSim-PR291_) +- Correct spelling of colocated in doc strings (SmartSim-PR290_) - Deprecated launcher-specific orchestrators, constants, and ML - utilities were removed. (PR289_) -- Relax the coloredlogs version to be greater than 10.0 (PR288_) + utilities were removed. 
(SmartSim-PR289_) +- Relax the coloredlogs version to be greater than 10.0 (SmartSim-PR288_) - Update the Github Actions runner image from `macos-10.15`` to `macos-12``. The - former began deprecation in May 2022 and was finally removed in May 2023. (PR285_) + former began deprecation in May 2022 and was finally removed in May 2023. (SmartSim-PR285_) - The Fortran tutorials had not been fully updated to show how to handle - return/error codes. These have now all been updated. (PR284_) + return/error codes. These have now all been updated. (SmartSim-PR284_) - Orchestrator and Colocated DB now accept a list of interfaces to bind to. The - argument name is still `interface` for backward compatibility reasons. (PR281_) + argument name is still `interface` for backward compatibility reasons. (SmartSim-PR281_) - Typehints have been added to public APIs. A makefile target to execute static - analysis with mypy is available `make check-mypy`. (PR295_) - -.. _PR317: https://github.com/CrayLabs/SmartSim/pull/317 -.. _PR316: https://github.com/CrayLabs/SmartSim/pull/316 -.. _PR315: https://github.com/CrayLabs/SmartSim/pull/314 -.. _PR314: https://github.com/CrayLabs/SmartSim/pull/314 -.. _PR313: https://github.com/CrayLabs/SmartSim/pull/313 -.. _PR312: https://github.com/CrayLabs/SmartSim/pull/312 -.. _PR310: https://github.com/CrayLabs/SmartSim/pull/310 -.. _PR307: https://github.com/CrayLabs/SmartSim/pull/307 -.. _PR306: https://github.com/CrayLabs/SmartSim/pull/306 -.. _PR305: https://github.com/CrayLabs/SmartSim/pull/305 -.. _PR304: https://github.com/CrayLabs/SmartSim/pull/304 -.. _PR303: https://github.com/CrayLabs/SmartSim/pull/303 -.. _PR302: https://github.com/CrayLabs/SmartSim/pull/302 -.. _PR301: https://github.com/CrayLabs/SmartSim/pull/301 -.. _PR300: https://github.com/CrayLabs/SmartSim/pull/300 -.. _PR298: https://github.com/CrayLabs/SmartSim/pull/298 -.. _PR295: https://github.com/CrayLabs/SmartSim/pull/295 -.. 
_PR293: https://github.com/CrayLabs/SmartSim/pull/293 -.. _PR292: https://github.com/CrayLabs/SmartSim/pull/292 -.. _PR291: https://github.com/CrayLabs/SmartSim/pull/291 -.. _PR290: https://github.com/CrayLabs/SmartSim/pull/290 -.. _PR289: https://github.com/CrayLabs/SmartSim/pull/289 -.. _PR288: https://github.com/CrayLabs/SmartSim/pull/288 -.. _PR287: https://github.com/CrayLabs/SmartSim/pull/287 -.. _PR285: https://github.com/CrayLabs/SmartSim/pull/285 -.. _PR284: https://github.com/CrayLabs/SmartSim/pull/284 -.. _PR283: https://github.com/CrayLabs/SmartSim/pull/283 -.. _PR281: https://github.com/CrayLabs/SmartSim/pull/281 + analysis with mypy is available `make check-mypy`. (SmartSim-PR295_) + +.. _SmartSim-PR317: https://github.com/CrayLabs/SmartSim/pull/317 +.. _SmartSim-PR316: https://github.com/CrayLabs/SmartSim/pull/316 +.. _SmartSim-PR315: https://github.com/CrayLabs/SmartSim/pull/315 +.. _SmartSim-PR314: https://github.com/CrayLabs/SmartSim/pull/314 +.. _SmartSim-PR313: https://github.com/CrayLabs/SmartSim/pull/313 +.. _SmartSim-PR312: https://github.com/CrayLabs/SmartSim/pull/312 +.. _SmartSim-PR310: https://github.com/CrayLabs/SmartSim/pull/310 +.. _SmartSim-PR307: https://github.com/CrayLabs/SmartSim/pull/307 +.. _SmartSim-PR306: https://github.com/CrayLabs/SmartSim/pull/306 +.. _SmartSim-PR305: https://github.com/CrayLabs/SmartSim/pull/305 +.. _SmartSim-PR304: https://github.com/CrayLabs/SmartSim/pull/304 +.. _SmartSim-PR303: https://github.com/CrayLabs/SmartSim/pull/303 +.. _SmartSim-PR302: https://github.com/CrayLabs/SmartSim/pull/302 +.. _SmartSim-PR301: https://github.com/CrayLabs/SmartSim/pull/301 +.. _SmartSim-PR300: https://github.com/CrayLabs/SmartSim/pull/300 +.. _SmartSim-PR298: https://github.com/CrayLabs/SmartSim/pull/298 +.. _SmartSim-PR295: https://github.com/CrayLabs/SmartSim/pull/295 +.. _SmartSim-PR293: https://github.com/CrayLabs/SmartSim/pull/293 +.. _SmartSim-PR292: https://github.com/CrayLabs/SmartSim/pull/292 +.. 
_SmartSim-PR291: https://github.com/CrayLabs/SmartSim/pull/291 +.. _SmartSim-PR290: https://github.com/CrayLabs/SmartSim/pull/290 +.. _SmartSim-PR289: https://github.com/CrayLabs/SmartSim/pull/289 +.. _SmartSim-PR288: https://github.com/CrayLabs/SmartSim/pull/288 +.. _SmartSim-PR287: https://github.com/CrayLabs/SmartSim/pull/287 +.. _SmartSim-PR285: https://github.com/CrayLabs/SmartSim/pull/285 +.. _SmartSim-PR284: https://github.com/CrayLabs/SmartSim/pull/284 +.. _SmartSim-PR283: https://github.com/CrayLabs/SmartSim/pull/283 +.. _SmartSim-PR281: https://github.com/CrayLabs/SmartSim/pull/281 0.4.2 ----- @@ -211,38 +286,38 @@ Detailed Notes - Running some tests could result in some SmartSim-specific environment variables to be set. Such environment variables are now reset after each test execution. Also, a warning for environment variable usage in Slurm was added, to make the user aware in case an environment - variable will not be assigned the desired value with `--export`. (PR270_) -- The PyTorch and TensorFlow data loaders were update to make use of aggregation lists. This breaks their API, but makes them easier to use. (PR264_) + variable will not be assigned the desired value with `--export`. (SmartSim-PR270_) +- The PyTorch and TensorFlow data loaders were update to make use of aggregation lists. This breaks their API, but makes them easier to use. (SmartSim-PR264_) - The support for Ray was dropped, as its most recent versions caused problems when deployed through SmartSim. We plan to release a separate add-on library to accomplish the same results. If - you are interested in getting the Ray launch functionality back in your workflow, please get in touch with us! (PR263_) -- Update from Redis version 6.0.8 to 7.0.5. (PR258_) + you are interested in getting the Ray launch functionality back in your workflow, please get in touch with us! (SmartSim-PR263_) +- Update from Redis version 6.0.8 to 7.0.5. 
(SmartSim-PR258_) - Adds support for Python 3.10 without the ONNX machine learning backend. Deprecates support for Python 3.7 as it will stop receiving security updates. Deprecates support for RedisAI 1.2.3. Update the build process to be able to correctly fetch supported dependencies. If a user attempts to build an unsupported dependency, an error message is shown highlighting the - discrepancy. (PR256_) + discrepancy. (SmartSim-PR256_) - Models were given a `batch_settings` attribute. When launching a model through `Experiment.start` the `Experiment` will first check for a non-nullish value at that attribute. If the check is satisfied, the `Experiment` will attempt to wrap the underlying run command in a batch job using the object referenced at `Model.batch_settings` as the batch settings for the job. If the check - is not satisfied, the `Model` is launched in the traditional manner as a job step. (PR245_) -- Fix bug in colocated database entrypoint stemming from uninitialized variables. This bug affects PyTorch models being loaded into the database. (PR237_) -- The release of RedisAI 1.2.7 allows us to update support for recent versions of PyTorch, Tensorflow, and ONNX (PR234_) + is not satisfied, the `Model` is launched in the traditional manner as a job step. (SmartSim-PR245_) +- Fix bug in colocated database entrypoint stemming from uninitialized variables. This bug affects PyTorch models being loaded into the database. (SmartSim-PR237_) +- The release of RedisAI 1.2.7 allows us to update support for recent versions of PyTorch, Tensorflow, and ONNX (SmartSim-PR234_) - Make installation of correct Torch backend more reliable according to instruction from PyTorch - In addition to TCP, add UDS support for colocating an orchestrator with models. Methods `Model.colocate_db_tcp` and `Model.colocate_db_uds` were added to expose this functionality. 
- The `Model.colocate_db` method remains and uses TCP for backward compatibility (PR246_) + The `Model.colocate_db` method remains and uses TCP for backward compatibility (SmartSim-PR246_) -.. _PR270: https://github.com/CrayLabs/SmartSim/pull/270 -.. _PR264: https://github.com/CrayLabs/SmartSim/pull/264 -.. _PR263: https://github.com/CrayLabs/SmartSim/pull/263 -.. _PR258: https://github.com/CrayLabs/SmartSim/pull/258 -.. _PR256: https://github.com/CrayLabs/SmartSim/pull/256 -.. _PR246: https://github.com/CrayLabs/SmartSim/pull/246 -.. _PR245: https://github.com/CrayLabs/SmartSim/pull/245 -.. _PR237: https://github.com/CrayLabs/SmartSim/pull/237 -.. _PR234: https://github.com/CrayLabs/SmartSim/pull/234 +.. _SmartSim-PR270: https://github.com/CrayLabs/SmartSim/pull/270 +.. _SmartSim-PR264: https://github.com/CrayLabs/SmartSim/pull/264 +.. _SmartSim-PR263: https://github.com/CrayLabs/SmartSim/pull/263 +.. _SmartSim-PR258: https://github.com/CrayLabs/SmartSim/pull/258 +.. _SmartSim-PR256: https://github.com/CrayLabs/SmartSim/pull/256 +.. _SmartSim-PR246: https://github.com/CrayLabs/SmartSim/pull/246 +.. _SmartSim-PR245: https://github.com/CrayLabs/SmartSim/pull/245 +.. _SmartSim-PR237: https://github.com/CrayLabs/SmartSim/pull/237 +.. _SmartSim-PR234: https://github.com/CrayLabs/SmartSim/pull/234 0.4.1 @@ -487,10 +562,10 @@ Description: --------------------------------------------------------------- +.. _sr_changelog: + SmartRedis ========== -.. _changelog: - .. include:: ../smartredis/doc/changelog.rst :start-line: 3 diff --git a/doc/code_of_conduct.rst b/doc/code_of_conduct.rst index 1ba818875..77d278b66 100644 --- a/doc/code_of_conduct.rst +++ b/doc/code_of_conduct.rst @@ -24,30 +24,33 @@ as Zoom, Teams, Google Meet, etc. * **Be welcoming, friendly, and patient.** * **Be considerate.** -Your work will be used by other people, and you in turn will depend on the work -of others. 
Any decision you make will affect users and colleagues, and you -should take those consequences into account when making decisions. + Your work will be used by other people, and you in turn will depend on the work + of others. Any decision you make will affect users and colleagues, and you + should take those consequences into account when making decisions. + * **Be respectful.** -Not all of us will agree all the time, but disagreement is no excuse for poor -behaviour and poor manners. We might all experience some frustration now and -then, but we cannot allow that frustration to turn into a personal attack. It is -important to remember that a community where people feel uncomfortable or -threatened is not a productive one. Members of the SmartSim community should be -respectful when dealing with other members as well as with people outside the -SmartSim community. + Not all of us will agree all the time, but disagreement is no excuse for poor + behaviour and poor manners. We might all experience some frustration now and + then, but we cannot allow that frustration to turn into a personal attack. It is + important to remember that a community where people feel uncomfortable or + threatened is not a productive one. Members of the SmartSim community should be + respectful when dealing with other members as well as with people outside the + SmartSim community. + * **Be careful in the words that you choose.** -Sexist, racist, and other exclusionary jokes and comments can be offensive to -those around you. Be kind to others. Do not insult or put down other -participants. Behave professionally. Remember that harassment and sexist, -racist, or exclusionary jokes are not appropriate for the community. + Sexist, racist, and other exclusionary jokes and comments can be offensive to + those around you. Be kind to others. Do not insult or put down other + participants. Behave professionally. 
Remember that harassment and sexist, + racist, or exclusionary jokes are not appropriate for the community. + * **When we disagree, we try to understand why.** -Disagreements, both social and technical, are a natural part of collaborative -development. It is important that we resolve disagreements and differing views -constructively. Remember that we are different. The strength of SmartSim comes -from its varied community, people from a wide range of backgrounds. Different -people have different perspectives on issues. Being unable to understand why -someone holds a viewpoint does not mean that they are wrong. Do not forget that -it is human to err and blaming each other does not get us anywhere. Rather, -offer to help resolve issues and to help learn from mistakes. + Disagreements, both social and technical, are a natural part of collaborative + development. It is important that we resolve disagreements and differing views + constructively. Remember that we are different. The strength of SmartSim comes + from its varied community, people from a wide range of backgrounds. Different + people have different perspectives on issues. Being unable to understand why + someone holds a viewpoint does not mean that they are wrong. Do not forget that + it is human to err and blaming each other does not get us anywhere. Rather, + offer to help resolve issues and to help learn from mistakes. Original text courtesy of the `Speak Up! project `_. \ No newline at end of file diff --git a/doc/conf.py b/doc/conf.py index 637e6cba6..908b9534f 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -10,6 +10,8 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
# +# pylint: skip-file + import os import sys sys.path.insert(0, os.path.abspath('.')) @@ -24,7 +26,7 @@ import smartsim version = smartsim.__version__ except ImportError: - version = "0.5.1" + version = "0.6.0" # The full version, including alpha/beta/rc tags release = version @@ -37,6 +39,7 @@ # ones. extensions = [ 'sphinx.ext.autodoc', + 'sphinx.ext.autosectionlabel', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.imgmath', @@ -47,9 +50,12 @@ 'sphinxfortran.fortran_domain', 'sphinxfortran.fortran_autodoc', 'breathe', - 'nbsphinx' + 'nbsphinx', + 'sphinx_copybutton', + 'sphinx_tabs.tabs' ] +suppress_warnings = ['autosectionlabel'] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -82,10 +88,16 @@ # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] +slack_invite ="https://join.slack.com/t/craylabs/shared_invite/zt-nw3ag5z5-5PS4tIXBfufu1bIvvr71UA" +extra_footer = ('Questions? You can contact contact us or ' + f'join us on Slack!' + ) + html_theme_options = { "repository_url": "https://github.com/CrayLabs/SmartSim", "use_repository_button": True, "use_issues_button": True, + "extra_footer": extra_footer, } autoclass_content = 'both' diff --git a/doc/contributing.rst b/doc/contributing.rst index cb7965269..a8a860045 100644 --- a/doc/contributing.rst +++ b/doc/contributing.rst @@ -1,5 +1,5 @@ -********* +****************** Contributing Guide ****************** @@ -48,30 +48,30 @@ How to contribute We welcome contributions from the broader community. These generally fall under two categories: -- Developments to the main codebases: These contributions directly modify the -SmartSim and SmartRedis repositories to fix bugs in the code, improve -the documentation, or add new features. 
Before embarking on major development, -please contact the developers first to make sure that your proposal aligns -with the internal development by opening an issue or reach out to us. Please -follow the :ref:`Developer Guidelines `. +- Contributions to the main codebases: These contributions directly modify the + SmartSim and SmartRedis repositories to fix bugs in the code, improve + the documentation, or add new features. Before embarking on major development, + please contact the developers first to make sure that your proposal aligns + with the internal development by opening an issue or reaching out to us. Please + follow the :ref:`Developer Guidelines `. - Applications of SmartSim: Many of our users build scientific/engineering -applications using SmartSim and SmartRedis. Please feel free to make -contributions to the :ref:`SmartSim Zoo `. + applications using SmartSim and SmartRedis. Please feel free to make + contributions to the :ref:`SmartSim Zoo `. In both cases, contributors can expect the following: - A quick response to your pull requests detailing why we are accepting or -rejecting your contribution + rejecting your contribution. - If we accept your contribution, a SmartSim developer will be assigned to help -bring your contribution into the codebases by testing it across a variety of -platforms and ensuring code quality. + bring your contribution into the codebases by testing it across a variety of + platforms and ensuring code quality. -- You will be credited as a co-author when the contribution is merged +- You will be credited as a co-author when the contribution is merged. .. note:: - The last point serves as a `Developer Certificate of Origin`_. + The last point serves as a `Developer Certificate of Origin `_. More specifically, we will ask contributors to signoff on the final state of the PR before merging. This signoff will then be propagated into the final squash merge commit. @@ -95,4 +95,3 @@ through GitHub. 
For issues related to `SmartSim post here 4. **Contact the Development Team** - For all other inquiries including collaboration opportunities, please contact SmartSim at hpe dot com. - diff --git a/doc/developer.rst b/doc/developer.rst index cff124b2e..4009819c3 100644 --- a/doc/developer.rst +++ b/doc/developer.rst @@ -1,9 +1,9 @@ +.. _developer_guide: ********* Developer ********* -.. _developer_guide: This section details common practices and tips for contributors to SmartSim and SmartRedis. diff --git a/doc/index.rst b/doc/index.rst index d61fdb1ce..91a7ee1ba 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -32,6 +32,7 @@ experiment orchestrator launchers + ml_features api/smartsim_api .. toctree:: @@ -46,8 +47,15 @@ sr_data_structures sr_dataset_conversions sr_runtime + sr_advanced_topics api/smartredis_api +.. toctree:: + :maxdepth: 2 + :caption: SmartDashboard + + smartdashboard + .. toctree:: :maxdepth: 2 :caption: Reference diff --git a/doc/installation_instructions/basic.rst b/doc/installation_instructions/basic.rst index f18cc3ec6..3874eb961 100644 --- a/doc/installation_instructions/basic.rst +++ b/doc/installation_instructions/basic.rst @@ -33,7 +33,6 @@ The base prerequisites to install SmartSim and SmartRedis are: GCC 5-9, 11, and 12 is recommended. There are known bugs with GCC 10. -.. _GPU Support: GPU Support =========== @@ -80,17 +79,13 @@ Supported Versions Native support for various machine learning libraries and their -versions is dictated by our dependency on RedisAI_ 1.2.7. Users -can also select RedisAI 1.2.5 (though that also limits -the version of the ML libraries). +versions is dictated by our dependency on RedisAI_ 1.2.7. 
+------------------+----------+-------------+---------------+ | RedisAI | PyTorch | Tensorflow | ONNX Runtime | +==================+==========+=============+===============+ | 1.2.7 (default) | 1.11.0 | 2.8.0 | 1.11.1 | +------------------+----------+-------------+---------------+ -| 1.2.5 | 1.9.0 | 2.6.0 | 1.9.0 | -+------------------+----------+-------------+---------------+ TensorFlow_ 2.0 and Keras_ are supported through `graph freezing`_. @@ -304,5 +299,3 @@ Build the SmartRedis library ============================ .. include:: ../../smartredis/doc/install/lib.rst - - diff --git a/doc/installation_instructions/platform/olcf-summit.rst b/doc/installation_instructions/platform/olcf-summit.rst index bbc48f810..dfd7b9666 100644 --- a/doc/installation_instructions/platform/olcf-summit.rst +++ b/doc/installation_instructions/platform/olcf-summit.rst @@ -11,23 +11,28 @@ instructions to get a working SmartSim build with PyTorch and TensorFlow for GPU on Summit. Note that SmartSim and SmartRedis will be downloaded to the working directory from which these instructions are executed. +Note that the available PyTorch version (1.10.2) does not match +the one expected by RedisAI 1.2.7 (1.11): it is still compatible and should +work, but please open an issue on SmartSim's GitHub repo if you run +into problems. + .. 
code-block:: bash # setup Python and build environment - export ENV_NAME=smartsim-0.5.1 + export ENV_NAME=smartsim-0.6.0 git clone https://github.com/CrayLabs/SmartRedis.git smartredis git clone https://github.com/CrayLabs/SmartSim.git smartsim - conda config --prepend channels https://ftp.osuosl.org/pub/open-ce/1.4.1/ + conda config --prepend channels https://ftp.osuosl.org/pub/open-ce/1.6.1/ conda create --name $ENV_NAME -y python=3.9 \ git-lfs \ cmake \ make \ cudnn=8.1.1_11.2 \ cudatoolkit=11.2.2 \ - tensorflow=2.6.2 \ - libtensorflow=2.6.2 \ - pytorch=1.9.0 \ - torchvision=0.10.0 + tensorflow=2.8.1 \ + libtensorflow \ + pytorch=1.10.2 \ + torchvision=0.11.3 conda activate $ENV_NAME export CC=$(which gcc) export CXX=$(which g++) @@ -50,7 +55,7 @@ directory from which these instructions are executed. # install PyTorch and TensorFlow backend for the Orchestrator database. export Torch_DIR=/ccs/home/$USER/.conda/envs/$ENV_NAME/lib/python3.9/site-packages/torch/share/cmake/Torch/ export CFLAGS="$CFLAGS -I/ccs/home/$USER/.conda/envs/$ENV_NAME/lib/python3.9/site-packages/tensorflow/include" - export SMARTSIM_REDISAI=1.2.5 + export SMARTSIM_REDISAI=1.2.7 export Tensorflow_BUILD_DIR=/ccs/home/$USER/.conda/envs/$ENV_NAME/lib/python3.9/site-packages/tensorflow/ smart build --device=gpu --torch_dir $Torch_DIR --libtensorflow_dir $Tensorflow_BUILD_DIR -v diff --git a/doc/ml_features.rst b/doc/ml_features.rst new file mode 100644 index 000000000..51027e7ae --- /dev/null +++ b/doc/ml_features.rst @@ -0,0 +1,505 @@ +########### +ML Features +########### + +In this section, we illustrate features which +users are expected to use in HPC workloads, especially when +simulation and AI are required to interact. The topics are +explained through code snippets, +with code that goes beyond SmartSim and SmartRedis API +(e.g. 
code showing how to jit-script a PyTorch model): the +intention is that of showing *one* simple way of leveraging +a feature, but more optimized ways of using third-party +libraries may exist. + +Examples are written in Python, but the same +result can be achieved with any SmartRedis client (C, C++, +Fortran and Python). Please refer to the SmartRedis API +for language-specific details. + +ML Model Deployment and Execution in the Database +=================================================== + +The combination of SmartSim and SmartRedis enables users +to store more than simple tensors on the database (DB). +In the upcoming subsections, we demonstrate how to use a +SmartRedis client to upload executable code, in the +form of ML models, scripts, and functions, to the DB. +Once stored, the code can be executed using the SmartRedis client +methods and used to process tensors directly in the DB. +The tensors generated from running the stored code will also be stored +in the database and can be retrieved with standard SmartRedis ``Client.get_tensor()`` calls. + +SmartRedis offers two ways to upload serialized code +to the DB: from memory and from file. We will go through examples +demonstrating how to upload from each. We provide the following examples: + +- :ref:`TensorFlow and PyTorch <ml_features_TF_PT>`: Serialize a TensorFlow/Keras or PyTorch model, optionally + save it to file, upload it to the DB, then execute it on tensors stored on the DB. +- :ref:`TorchScript Functions `: Serialize TorchScript functions, optionally + save them to file, upload them to the DB, then execute them on tensors stored on the DB. +- :ref:`ONNX Runtime `: Convert a Scikit-Learn model to ONNX + format, upload it to the DB, then execute it on tensors stored on the DB. + + +.. note:: + In all examples, we will assume that a SmartSim ``Orchestrator`` + is up and running, and that the code we will show is run as part + of a SmartSim-launched application ``Model``. + + +.. 
_ml_features_TF_PT: + +TensorFlow and PyTorch +---------------------- + +In this section, we will see how a TensorFlow/Keras or a PyTorch model +can be serialized using SmartSim's helper functions. +Once the model is serialized, we will use the SmartRedis client to upload it to the DB, +and execute it on data stored on the DB. +We will also see how the model can be optionally saved to file. The +workflow for TensorFlow and PyTorch is almost identical, but we provide +the code for each toolkit in a dedicated tab, for completeness. + +We begin by defining the ML model that we will use in both examples of +this section. + +.. tabs:: + + .. group-tab:: TensorFlow + + .. code-block:: python + + import numpy as np + from smartredis import Client + from tensorflow import keras + from smartsim.ml.tf import freeze_model + + model = keras.Sequential( + layers=[ + keras.layers.InputLayer(input_shape=(28, 28), name="input"), + keras.layers.Flatten(input_shape=(28, 28), name="flatten"), + keras.layers.Dense(128, activation="relu", name="dense"), + keras.layers.Dense(10, activation="softmax", name="output"), + ], + name="FCN", + ) + + # Compile model with optimizer + model.compile( + optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"] + ) + + .. group-tab:: PyTorch + + .. 
code-block:: python + + import io + + import numpy as np + import torch + import torch.nn as nn + import torch.nn.functional as F + from smartredis import Client + + # simple MNIST in PyTorch + class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.dropout1 = nn.Dropout(0.25) + self.dropout2 = nn.Dropout(0.5) + self.fc1 = nn.Linear(9216, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = F.max_pool2d(x, 2) + x = self.dropout1(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = F.relu(x) + x = self.dropout2(x) + x = self.fc2(x) + output = F.log_softmax(x, dim=1) + return output + + # Instantiate the model + n = Net() + + +============================================================ +Serializing the model and uploading it to the DB from memory +============================================================ +Once the model is instantiated, it needs to be serialized to be uploaded +to the DB using the SmartRedis client. + +.. tabs:: + + .. group-tab:: TensorFlow + + As part of its :ref:`TensorFlow helper functions `, + SmartSim provides ``serialize_model()`` to serialize a TensorFlow or Keras + model. + + .. code-block:: python + + serialized_model, inputs, outputs = serialize_model(model) + + + Note that ``serialize_model()`` conveniently returns the model as bytestring + and the names of the input and output layers, which are now needed to upload the TensorFlow + model to the DB using ``Client.set_model()``. + We also use ``Client.put_tensor()`` to upload a batch of 20 synthetic MNIST samples to the DB. + + .. 
code-block:: python + + # Instantiate and connect SmartRedis client to communicate with DB + client = Client(cluster=False) + model_key = "mnist_cnn" + # Set device to CPU if GPU not available to DB + client.set_model( + model_key, serialized_model, "TF", device="GPU", inputs=inputs, outputs=outputs + ) + + + .. group-tab:: PyTorch + + PyTorch requires models to be `jit-traced `__. + The method ``torch.jit.save()`` can either store the model in memory or on file. Here, + we will keep it in memory as a bytestring. + + .. code-block:: python + + # Example input needed for jit tracing + example_forward_input = torch.rand(20, 1, 28, 28) + module = torch.jit.trace(n, example_forward_input) + model_buffer = io.BytesIO() + torch.jit.save(module, model_buffer) + serialized_model = model_buffer.getvalue() + + Now that we have the serialized model, we can upload it to the DB using ``Client.set_model()``. + + + We also use ``Client.put_tensor()`` to upload a batch of 20 synthetic MNIST samples to the DB. + + .. code-block:: python + + # Instantiate and connect SmartRedis client to communicate with DB + client = Client(cluster=False) + model_key = "mnist_cnn" + # Set device to CPU if GPU not available to DB + client.set_model(model_key, serialized_model, "TORCH", device="GPU") + + +For details about ``Client.set_model()``, please +refer to :ref:`SmartRedis API `. + + +===================================================== +Saving the model to a file and uploading it to the DB +===================================================== + +Once the model is compiled, it can be serialized and stored on the filesystem. This is +useful if the model has to be used at a later time. Once the model is saved to file, +it can be uploaded to the DB using the SmartRedis client. + +.. tabs:: + + .. group-tab:: TensorFlow + + As part of its :ref:`TensorFlow helper functions `, + SmartSim provides ``freeze_model()`` to serialize a TensorFlow or Keras + model and save it to file.
In this example, the file will be named ``mnist.pb``. + + .. code-block:: python + + filename = "mnist.pb" + model_path, inputs, outputs = freeze_model(model, '.', filename) + + + Note that ``freeze_model()`` conveniently returns the path to the serialized model file, + and the names of the input and output layers, which are now needed to upload the TensorFlow + model to the DB using ``Client.set_model_from_file()``. We also use + ``Client.put_tensor()`` to upload a synthetic MNIST sample to the DB. + + + .. code-block:: python + + client = Client(cluster=False) + model_key = "mnist_cnn" + client.set_model_from_file( + model_key, model_path, "TF", device="GPU", inputs=inputs, outputs=outputs + ) + + + .. group-tab:: PyTorch + + PyTorch requires models to be `jit-traced `__. + The method ``torch.jit.save()`` can either store the model in memory or on file. Here, + we will save it to a file located at ``./traced_model.pt``. + + .. code-block:: python + + # Example input needed for jit tracing + example_forward_input = torch.rand(20, 1, 28, 28) + module = torch.jit.trace(n, example_forward_input) + model_path = "./traced_model.pt" + torch.jit.save(module, model_path) + + + Now that we have the serialized model, we can upload it to the DB using the + ``Client.set_model_from_file()`` method. + + .. code-block:: python + + client = Client(cluster=False) + model_key = "mnist_cnn" + + client.set_model_from_file(model_key, model_path, "TORCH", device="CPU") + + +For details about ``Client.set_model_from_file()``, please +refer to :ref:`SmartRedis API `. + +=============================================== +Executing the model on tensors stored in the DB +=============================================== + +Now that the model is available for execution on the DB, we use the SmartRedis client +to upload a tensor representing a batch of 20 synthetic MNIST images. + +.. tabs:: + + .. group-tab:: TensorFlow + + ..
code-block:: python + + # 20 samples of "image" data + mnist_images = np.random.rand(20, 28, 28, 1).astype(np.float32) + # client was instantiated previously + client.put_tensor("mnist_images", mnist_images) + + + .. group-tab:: PyTorch + + .. code-block:: python + + + # 20 samples of "image" data + mnist_images = torch.rand(20, 1, 28, 28) + # client was instantiated previously + client.put_tensor("mnist_images", mnist_images.numpy()) + + +Now we can use ``Client.run_model()`` to execute the model on the data we have +just stored and ``Client.get_tensor()`` to download the output of the model execution. +Notice that, for this part, the code is identical for models uploaded from file and from memory, and +with TensorFlow or PyTorch backends. + +.. code-block:: python + + client.run_model(model_key, inputs=["mnist_images"], outputs=["mnist_output"]) + output = client.get_tensor("mnist_output") + + +For details about ``Client.run_model()``, please +refer to :ref:`SmartRedis API `. + +.. _ml_features_torchscript: + +TorchScript Functions +--------------------- +Instead of Neural Networks, or, in general, Machine Learning models, it is +possible to upload to the DB (collections of) functions which can be used e.g. +to perform pre- or post-processing operations on tensors stored on the DB. + +Since the functions are going to be stored as TorchScript modules, they + +- need to be jit-traceable +- can use ``torch`` as a built-in module +- can **not** import modules + +In this section we will see how to + +- save a collection of functions to a script file, upload them to the DB, + and execute them on tensors stored on the DB. +- define and upload a function on-the-fly from a Python script and + execute it on tensors stored on the DB.
+ + +================================================================= +Uploading a script containing a collection of functions to the DB +================================================================= + +The easiest way of defining and storing functions on the DB is to create a +dedicated file. In that file, we can define functions which will be callable +through the SmartRedis client, but also from other functions in the +same file. A typical script file would look like this: + +.. code-block:: python + + def rescale(tensor, mu: float, sigma: float): + mean = tensor.mean() + std = tensor.std() + + normalized = (tensor-mean)/std + return normalized*sigma + mu + + def shift_y_to_x(x, y): + mu_x = x.mean() + sigma_x = x.std() + y_rescaled = rescale(y, mu_x, sigma_x) + + return y_rescaled + +In the script, we defined ``shift_y_to_x``, +a function which returns a modified copy of a tensor ``y``, +which matches the statistical distribution of the tensor ``x``. +Notice that we are not importing ``torch`` in the script, as it will +be recognized as a built-in by the TorchScript compiler. Because +of the discrepancy between TorchScript's and Python's syntaxes, TorchScript +scripts cannot be run as standalone Python scripts. + +Here is the code which allows us to run the function ``shift_y_to_x`` on +tensors stored in the DB. We will assume that the above script is stored +as ``"./shift.script"``. + +..
code-block:: python + + import numpy as np + from smartredis import Client + + # Generate tensors according to two different random distributions + x = np.random.rand(100, 100).astype(np.float32) + y = np.random.rand(100, 100).astype(np.float32) * 2 + 10 + + # Instantiate and connect SmartRedis client + client = Client(cluster=False) + + # Upload tensors to DB + client.put_tensor("X_rand", x) + client.put_tensor("Y_rand", y) + + # Upload script containing functions to DB + client.set_script_from_file("shifter", "./shift.script", device="CPU") + # Run the function ``shift_y_to_x`` on ``X_rand`` and ``Y_rand`` + client.run_script("shifter", "shift_y_to_x", inputs=["X_rand", "Y_rand"], outputs=["Y_scaled"]) + # Download output + y_scaled = client.get_tensor("Y_scaled") + + +In the above code, we used ``Client.put_tensor()`` to upload tensors to the DB, and +``Client.set_script_from_file()`` to upload the script containing the collection of functions. +We then used ``Client.run_script()`` to run the function ``shift_y_to_x`` on the stored +tensors, and downloaded the result with ``Client.get_tensor()``. + +For details about ``Client.set_script_from_file()`` and ``Client.run_script()``, please +refer to :ref:`SmartRedis API `. + + +========================================= +Uploading a function to the DB on-the-fly +========================================= + +Simpler functions (or functions that do not require calling other user-defined +or imported functions), can be defined inline and uploaded to the DB using the SmartRedis client. +For example: + +.. 
code-block:: python + + import numpy as np + from smartredis import Client + + def normalize(X): + """Simple function to normalize a tensor""" + mean = X.mean() + std = X.std() + + return (X-mean)/std + + # Generate random tensor + x = np.random.rand(100, 100).astype(np.float32) * 2 + 10 + + # Instantiate and connect SmartRedis client + client = Client(cluster=False) + + # Upload tensor to DB + client.put_tensor("X_rand", x) + + # Upload function to DB, ``normalizer`` is the name of the collection + # of functions containing the function ``normalize`` only. It mimics + # the way `set_script` works. + client.set_function("normalizer", normalize) + # Run the function ``normalize`` on ``X_rand`` + client.run_script("normalizer", "normalize", inputs=["X_rand"], outputs=["X_norm"]) + # Download output + x_norm = client.get_tensor("X_norm") + +Notice that the key ``"normalizer"`` represents the script containing the function (similar to +``"shifter"`` in the previous example), while the function name is ``"normalize"``. + + +For details about ``Client.set_function()`` and ``Client.run_script()``, please +refer to :ref:`SmartRedis API `. + +.. _ml_features_ONNX: + +ONNX Runtime +------------ + +In the following example, we will see how, thanks to the ONNX runtime, +Machine Learning and Data Analysis functions defined in +Scikit-Learn can be serialized and then put on the DB using the SmartRedis client. + +We start by defining a Scikit-Learn ``LinearRegression`` model and serialize it, +keeping it into memory. + +.. 
code-block:: python + + import numpy as np + from skl2onnx import to_onnx + from sklearn.linear_model import LinearRegression + from smartredis import Client + + def build_lin_reg(): + """Generates sklearn linear regression model and serialize it""" + x = np.array([[1.0], [2.0], [6.0], [4.0], [3.0], [5.0]]).astype(np.float32) + y = np.array([[2.0], [3.0], [7.0], [5.0], [4.0], [6.0]]).astype(np.float32) + + linreg = LinearRegression() + linreg.fit(x, y) + linreg = to_onnx(linreg, x.astype(np.float32), target_opset=13) + return linreg.SerializeToString() + + linreg = build_lin_reg() + +Once the model is serialized, we can use ``Client.set_model()`` to upload it +to the DB. + +.. code-block:: python + + # connect a client to the database + client = Client(cluster=False) + client.set_model("linreg", linreg, "ONNX", device="GPU") + + +Finally, we can upload a tensor to the DB using ``Client.put_tensor()``, run the +stored model on it using ``Client.run_model()``, and download the output calling +``Client.get_tensor()``. + +.. code-block:: python + + # linreg test + X = np.array([[1.0], [2.0], [3.0], [4.0], [5.0]]).astype(np.float32) + client.put_tensor("X", X) + client.run_model("linreg", inputs=["X"], outputs=["Y"]) + Y = client.get_tensor("Y") + + +For details about ``Client.run_model()``, please +refer to :ref:`SmartRedis API `. diff --git a/doc/orchestrator.rst b/doc/orchestrator.rst index cf3cabc96..456d9a814 100644 --- a/doc/orchestrator.rst +++ b/doc/orchestrator.rst @@ -41,7 +41,7 @@ be used to execute the AI models, and Torchscript code on data stored within it. Users do not need to know how the data is stored in a clustered configuration and can address the cluster with the SmartRedis clients like a single block of memory using simple put/get semantics in SmartRedis. SmartRedis will ensure that data -is evenly distributed amoungst all nodes in the cluster. +is evenly distributed amongst all nodes in the cluster. 
The cluster deployment is optimal for high data throughput scenarios such as online analysis, training and processing. @@ -116,7 +116,7 @@ occupied by the database. .. note:: Pinning _only_ affects the co-located deployment because both the application and the database - are sharing the same compute node. For the clustered deployment, a shard occupies the entirerty + are sharing the same compute node. For the clustered deployment, a shard occupies the entirety of the node. Redis @@ -165,3 +165,475 @@ A full example of configuring KeyDB to run in SmartSim is shown below export REDIS_CONF=/path/to/keydb.conf # run smartsim workload + +Multiple Orchestrator Example +============================= +SmartSim offers functionality to automate the deployment of multiple +databases, supporting workloads that require multiple +``Orchestrators`` for an ``Experiment``. For instance, a workload may consist of a +simulation with high inference performance demands (necessitating a co-located deployment), +along with an analysis and +visualization workflow connected to the simulation (requiring a standard orchestrator). +In the following example, we simulate a simple version of this use case. + +The example is comprised of two script files: + +* The :ref:`Application Script` +* The :ref:`Experiment Driver Script` + +**The Application Script Overview:** +In this example, the application script is a Python file that +contains instructions to complete computational +tasks. Applications are not limited to Python +and can also be written in C, C++ and Fortran. +This script specifies creating a Python SmartRedis client for each +standard orchestrator and a colocated orchestrator. We use the +clients to request data from both standard databases, then +transfer the data to the colocated database. The application +file is launched by the experiment driver script +through a ``Model`` stage. + +**The Application Script Contents:** + +1.
Connecting SmartRedis clients within the application to retrieve tensors + from the standard databases to store in a colocated database. Details in section: + :ref:`Initialize the Clients`. + +**The Experiment Driver Script Overview:** +The experiment driver script holds the stages of the workflow +and manages their execution through the ``Experiment`` API. +We initialize an Experiment +at the beginning of the Python file and use the ``Experiment`` to +iteratively create, configure and launch computational kernels +on the system through the `slurm` launcher. +In the driver script, we use the ``Experiment`` to create and launch a ``Model`` instance that +runs the application. + +**The Experiment Driver Script Contents:** + +1. Launching two standard Orchestrators with unique identifiers. Details in section: + :ref:`Launch Multiple Orchestrators`. +2. Launching the application script with a co-located database. Details in section: + :ref:`Initialize a Colocated Model`. +3. Connecting SmartRedis clients within the driver script to send tensors to standard Orchestrators + for retrieval within the application. Details in section: + :ref:`Create Client Connections to Orchestrators`. + +Setup and run instructions can be found :ref:`here` + +The Application Script +---------------------- +Applications interact with the databases +through a SmartRedis client. +In this section, we write an application script +to demonstrate how to connect SmartRedis +clients in the context of multiple +launched databases. Using the clients, we retrieve tensors +from two databases launched in the driver script, then store +the tensors in the colocated database. + +.. note:: + The Experiment must be started to use the Orchestrators within the + application script. Otherwise, it will fail to connect. + Find the instructions on how to launch :ref:`here` + +To begin, import the necessary packages: + +.. 
literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py + :language: python + :linenos: + :lines: 1-3 + +Initialize the Clients +^^^^^^^^^^^^^^^^^^^^^^ +To establish a connection with each database, +we need to initialize a new SmartRedis client for each +``Orchestrator``. + +Step 1: Initialize ConfigOptions +"""""""""""""""""""""""""""""""" +Since we are launching multiple databases within the experiment, +the SmartRedis ``ConfigOptions`` object is required when initializing +a client in the application. +We use the ``ConfigOptions.create_from_environment()`` +function to create three instances of ``ConfigOptions``, +with one instance associated with each launched ``Orchestrator``. +Most importantly, to associate each launched Orchestrator to a ConfigOptions object, +the ``create_from_environment()`` function requires specifying the unique database identifier +argument named `db_identifier`. + +For the single-sharded database: + +.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py + :language: python + :linenos: + :lines: 5-6 + +For the multi-sharded database: + +.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py + :language: python + :linenos: + :lines: 10-11 + +For the colocated database: + +.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py + :language: python + :linenos: + :lines: 15-16 + +Step 2: Initialize the Client Connections +""""""""""""""""""""""""""""""""""""""""" +Now that we have three ``ConfigOptions`` objects, we have the +tools necessary to initialize three SmartRedis clients and +establish a connection with the three databases. +We use the SmartRedis ``Client`` API to create the client instances by passing in +the ``ConfigOptions`` objects and assigning a `logger_name` argument. + +Single-sharded database: + +.. 
literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py + :language: python + :linenos: + :lines: 7-8 + +Multi-sharded database: + +.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py + :language: python + :linenos: + :lines: 12-13 + +Colocated database: + +.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py + :language: python + :linenos: + :lines: 17-18 + +Retrieve Data and Store Using SmartRedis Client Objects +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +To confirm a successful connection to each database, we will retrieve the tensors +that we plan to store in the python driver script. After retrieving, we +store both tensors in the colocated database. +The ``Client.get_tensor()`` method allows +retrieval of a tensor. It requires the `name` of the tensor assigned +when sent to the database via ``Client.put_tensor()``. + +.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py + :language: python + :linenos: + :lines: 20-26 + +Later, when you run the experiment driver script the following output will appear in ``tutorial_model.out`` +located in ``getting-started-multidb/tutorial_model/``:: + + Model: single shard logger@00-00-00:The single sharded db tensor is: [1 2 3 4] + Model: multi shard logger@00-00-00:The multi sharded db tensor is: [5 6 7 8] + +This output showcases that we have established a connection with multiple Orchestrators. + +Next, take the tensors retrieved from the standard deployment databases and +store them in the colocated database using ``Client.put_tensor(name, data)``. + +.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py + :language: python + :linenos: + :lines: 28-30 + +Next, check if the tensors exist in the colocated database using ``Client.poll_tensor()``. +This function queries for data in the database. 
The function requires the tensor name (`name`), +how many milliseconds to wait in between queries (`poll_frequency_ms`), +and the total number of times to query (`num_tries`): + +.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py + :language: python + :linenos: + :lines: 32-37 + +The output will be as follows:: + + Model: colo logger@00-00-00:The colocated db has tensor_1: True + Model: colo logger@00-00-00:The colocated db has tensor_2: True + +The Experiment Driver Script +---------------------------- +To run the previous application, we must define workflow stages within a workload. +Defining workflow stages requires the utilization of functions associated +with the ``Experiment`` object. The Experiment object is intended to be instantiated +once and utilized throughout the workflow runtime. +In this example, we instantiate an ``Experiment`` object with the name ``getting-started-multidb``. +We set up the SmartSim ``logger`` to output information from the Experiment. + +.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py + :language: python + :linenos: + :lines: 1-10 + +Launch Multiple Orchestrators +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In the context of this ``Experiment``, it's essential to create and launch +the databases as a preliminary step before any other components since +the application script requests tensors from the launched databases. + +We aim to showcase the multi-database automation capabilities of SmartSim, so we +create two databases in the workflow: a single-sharded database and a +multi-sharded database. + +Step 1: Initialize Orchestrators +"""""""""""""""""""""""""""""""" +To create a database, utilize the ``Experiment.create_database()`` function. +The function requires specifying a unique +database identifier argument named `db_identifier` to launch multiple databases. +This step is necessary to connect to databases outside of the driver script.
+We will use the `db_identifier` names we specified in the application script. + +For the single-sharded database: + +.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py + :language: python + :linenos: + :lines: 12-14 + +For the multi-sharded database: + +.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py + :language: python + :linenos: + :lines: 16-18 + +.. note:: + Calling ``exp.generate()`` will create two subfolders + (one for each Orchestrator created in the previous step) + whose names are based on the db_identifier of that Orchestrator. + In this example, the Experiment folder is + named ``getting-started-multidb/``. Within this folder, two Orchestrator subfolders will + be created, namely ``single_shard_db_identifier/`` and ``multi_shard_db_identifier/``. + +Step 2: Start Databases +""""""""""""""""""""""" +Next, to launch the databases, +pass the database instances to ``Experiment.start()``. + +.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py + :language: python + :linenos: + :lines: 20-21 + +The ``Experiment.start()`` function launches the ``Orchestrators`` for use within the workflow. In other words, the function +deploys the databases on the allocated compute resources. + +.. note:: + By setting `summary=True`, SmartSim will print a summary of the + experiment before it is launched. After printing the experiment summary, + the experiment is paused for 10 seconds giving the user time to + briefly scan the summary contents. If we set `summary=False`, then the experiment + would be launched immediately with no summary. + +Create Client Connections to Orchestrators +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The SmartRedis ``Client`` object contains functions that manipulate, send, and receive +data within the database. Each database has a single, dedicated SmartRedis ``Client``. +Begin by initializing a SmartRedis ``Client`` object per launched database. 
+ +To create a designated SmartRedis ``Client``, you need to specify the address of the target +running database. You can easily retrieve this address using the ``Orchestrator.get_address()`` function. + +For the single-sharded database: + +.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py + :language: python + :linenos: + :lines: 23-24 + +For the multi-sharded database: + +.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py + :language: python + :linenos: + :lines: 25-26 + +Store Data Using Clients +^^^^^^^^^^^^^^^^^^^^^^^^ +In the application script, we retrieved two NumPy tensors. +To support the app's functionality, we will create two +NumPy arrays in the Python driver script and send them to the databases. To +accomplish this, we use the ``Client.put_tensor()`` function with the respective +database client instances. + +For the single-sharded database: + +.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py + :language: python + :linenos: + :lines: 28-31 + +For the multi-sharded database: + +.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py + :language: python + :linenos: + :lines: 33-36 + +Let's check to make sure the database tensors do not exist in the incorrect databases: + +.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py + :language: python + :linenos: + :lines: 38-42 + +When you run the experiment, the following output will appear:: + + 00:00:00 system.host.com SmartSim[#####] INFO The multi shard array key exists in the incorrect database: False + 00:00:00 system.host.com SmartSim[#####] INFO The single shard array key exists in the incorrect database: False + +Initialize a Colocated Model +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In the next stage of the experiment, we +launch the application script with a co-located database +by configuring and creating +a SmartSim colocated ``Model``.
+ +Step 1: Configure +""""""""""""""""" +You can specify the run settings of a model. +In this experiment, we invoke the Python interpreter to run +the python script defined in section: :ref:`The Application Script`. +To configure this into a ``Model``, we use the ``Experiment.create_run_settings()`` function. +The function returns a ``RunSettings`` object. +When initializing the RunSettings object, +we specify the path to the application file, +`application_script.py`, for +``exe_args``, and the run command for ``exe``. + +.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py + :language: python + :linenos: + :lines: 44-45 + +.. note:: + You will have to change the `exe_args` argument to the path of the application script + on your machine to run the example. + +With the ``RunSettings`` instance, +configure the distribution of computational tasks (``RunSettings.set_nodes()``) and the number of instances +the script is executed on each node (``RunSettings.set_tasks_per_node()``). In this +example, we specify to SmartSim that we intend to execute the script once on a single node. + +.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py + :language: python + :linenos: + :lines: 46-48 + +Step 2: Initialize +"""""""""""""""""" +Next, create a ``Model`` instance using ``Experiment.create_model()``. +Pass the ``model_settings`` object as an argument +to the ``create_model()`` function and assign to the variable ``model``. + +.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py + :language: python + :linenos: + :lines: 49-50 + +Step 3: Colocate +"""""""""""""""" +To colocate the model, use the ``Model.colocate_db_uds()`` function to +colocate an Orchestrator instance with this Model over +a Unix domain socket connection. + +..
literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py + :language: python + :linenos: + :lines: 51-52 + +This method will initialize settings which add an unsharded +database to this Model instance. Only this Model will be able +to communicate with this colocated database by using the loopback TCP interface. + +Step 4: Start +""""""""""""" +Next, launch the colocated model instance using the ``Experiment.start()`` function. + +.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py + :language: python + :linenos: + :lines: 53-54 + +.. note:: + We set `block=True`, + so that ``Experiment.start()`` waits until the last Model has finished + before returning: it will act like a job monitor, letting us know + if processes run, complete, or fail. + +Cleanup Experiment +^^^^^^^^^^^^^^^^^^ +Finally, use the ``Experiment.stop()`` function to stop the database instances. Print the +workflow summary with ``Experiment.summary()``. + +.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py + :language: python + :linenos: + :lines: 56-59 + +When you run the experiment, the following output will appear:: + + 00:00:00 system.host.com SmartSim[#####]INFO + | | Name | Entity-Type | JobID | RunID | Time | Status | Returncode | + |----|------------------------------|---------------|-------------|---------|---------|-----------|--------------| + | 0 | colo_model | Model | 1556529.5 | 0 | 1.7437 | Completed | 0 | + | 1 | single_shard_db_identifier_0 | DBNode | 1556529.3 | 0 | 68.8732 | Cancelled | 0 | + | 2 | multi_shard_db_identifier_0 | DBNode | 1556529.4+2 | 0 | 45.5139 | Cancelled | 0 | + +How to Run the Example +---------------------- +Below are the steps to run the experiment. Find the +:ref:`experiment source code` +and :ref:`application source code` +below in the respective subsections. + +.. note:: + The example assumes that you have already installed and built + SmartSim and SmartRedis.
Please refer to Section :ref:`Basic Installation` + for further details. For simplicity, we assume that you are + running on a SLURM-based HPC platform. Refer to the steps below + for more details. + +Step 1 : Set up your directory tree + Your directory tree should look similar to the one below:: + + SmartSim/ + SmartRedis/ + Multi-db-example/ + application_script.py + experiment_script.py + + You can find the application and experiment source code in the subsections below. + +Step 2 : Install and Build SmartSim + This example assumes you have installed SmartSim and SmartRedis in your + Python environment. We also assume that you have built SmartSim with + the necessary modules for the machine you are running on. + +Step 3 : Change the `exe_args` file path + When configuring the colocated model in `experiment_script.py`, + we pass the file path of `application_script.py` to the `exe_args` argument + on line 33 in :ref:`experiment_script.py`. + Edit this argument to the file path of your `application_script.py`. + +Step 4 : Run the Experiment + Finally, run the experiment with ``python experiment_script.py``. + + +Application Source Code +^^^^^^^^^^^^^^^^^^^^^^^ +.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py + :language: python + :linenos: + +Experiment Source Code +^^^^^^^^^^^^^^^^^^^^^^ +.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py + :language: python + :linenos: \ No newline at end of file diff --git a/doc/overview.rst b/doc/overview.rst index 3c3e36691..3ef046bb0 100644 --- a/doc/overview.rst +++ b/doc/overview.rst @@ -59,8 +59,8 @@ deploying HPC workloads alongside an in-memory database: Redis. The key features of the IL are: - An API to start, monitor, and stop HPC jobs from Python or from a Jupyter notebook. - - Automated deployment of in-memory data staging (Redis) and computational - storage (RedisAI). 
+ - Automated deployment of in-memory data staging (`Redis `_) and computational + storage (`RedisAI `_). - Programmatic launches of batch and in-allocation jobs on PBS, Slurm, LSF, and Cobalt systems. - Creating and configuring ensembles of workloads with isolated communication channels. diff --git a/doc/requirements-doc.txt b/doc/requirements-doc.txt index 643d17992..38d9c8052 100644 --- a/doc/requirements-doc.txt +++ b/doc/requirements-doc.txt @@ -1,10 +1,14 @@ -sphinx==4.4.0 -breathe==4.31.0 -sphinx-book-theme==0.2.0 +Sphinx==6.2.1 +breathe==4.35.0 sphinx-fortran==1.1.1 -nbsphinx>=0.8.2 -docutils==0.17 -torch==1.7.1 -tensorflow==2.5.2 +sphinx-book-theme==1.0.1 +sphinx-copybutton==0.5.2 +sphinx-tabs==3.4.4 +nbsphinx==0.9.3 +docutils==0.18.1 +torch==1.11.0 +tensorflow==2.8.1 ipython -jinja2==3.0.3 \ No newline at end of file +jinja2==3.1.2 +protobuf +numpy \ No newline at end of file diff --git a/doc/smartdashboard.rst b/doc/smartdashboard.rst new file mode 100644 index 000000000..532fa6db0 --- /dev/null +++ b/doc/smartdashboard.rst @@ -0,0 +1,7 @@ + +************** +SmartDashboard +************** + +.. include:: ../smartdashboard/doc/overview.rst + :start-line: 4 \ No newline at end of file diff --git a/doc/smartsim_zoo.rst b/doc/smartsim_zoo.rst index ce39fbaaa..1ce803bf6 100644 --- a/doc/smartsim_zoo.rst +++ b/doc/smartsim_zoo.rst @@ -1,51 +1,55 @@ -********************* +##################### Contributing Examples -********************* +##################### .. _smartsim_zoo: +========================= What Is the SmartSim Zoo? -######################### +========================= Given that SmartSim is a community developed and maintained project, we have -introduced a `SmartSim Example Zoo `_ +introduced a `SmartSim Example Zoo `__ that contains CrayLabs and user contributed examples of using SmartSim for various simulation and machine learning applications. 
+-------------------------------------- The Two Categories of the SmartSim Zoo -************************************** +-------------------------------------- 1. SmartSim Deployments (running SmartSim on various HPC Systems) * The source code for the repository serves the purpose of showing diverse - * examples of how to get SmartSim running on - different HPC Systems. If you are looking for working examples on a specific - machine, then the source code in the SmartSim-Zoo repository is for you. The - SmartSim development team strives to keep these examples updated with each - release so that users will always have robust examples for their needs. + + * examples of how to get SmartSim running on different HPC Systems. + If you are looking for working examples on a specific machine, then the source + code in the SmartSim-Zoo repository is for you. The + SmartSim development team strives to keep these examples updated with each + release so that users will always have robust examples for their needs. 2. SmartSim Applications (completed projects that use SmartSim) * The README for the repository describes some of the larger applications of * SmartSim. These examples fall under two categories: - examples by paper and examples by simulation model. The examples by paper are - based on existing research papers, and the examples by simulation models are - integrations of SmartSim with existing simulation models. + examples by paper and examples by simulation model. The examples by paper are + based on existing research papers, and the examples by simulation models are + integrations of SmartSim with existing simulation models. +================= How To Contribute -################# +================= We support, encourage, and welcome all contributions to the `SmartSim Zoo -`_ repository. Instructions for +`__ repository. Instructions for contributing examples varies whether you are contributing a SmartSim deployment or a SmartSim application. 1. 
Contributing SmartSim Deployment Examples * For contributing examples of SmartSim running on a HPC System, we ask that - you include a description and all references to code and relevant previous - implementations or open source code that the work is based on for the benefit - of anyone who would like to try out your example. + you include a description and all references to code and relevant previous + implementations or open source code that the work is based on for the benefit + of anyone who would like to try out your example. 2. Contributing SmartSim Application Examples @@ -60,12 +64,13 @@ or a SmartSim application. 7. Contact information 8. Citation +================= Existing Examples -################# +================= The subsequent tables summarize the examples provided in the SmartSim Zoo. You can find a more detailed description of each example in the `SmartSim Zoo -`_. +`__. .. list-table:: SmartSim Deployment Examples :widths: 50 100 @@ -95,33 +100,34 @@ can find a more detailed description of each example in the `SmartSim Zoo - Links * - DeepDriveMD - CrayLabs, Argonne National Lab, Oak Ridge National Lab - - `Implementation `_ `Original Paper `_ + - `Implementation `__ `Original Paper `__ * - TensorFlowFoam - CrayLabs, Argonne National Lab - - `Implementation `_ `Original Paper `_ + - `Implementation `__ `Original Paper `__ * - ML-EKE - CrayLabs, NCAR, University of Victoria - - `Implementation `_ `Paper `_ + - `Implementation `__ `Original Paper `__ * - LAMMPS + SmartSim - CrayLabs, Sandia National Laboratories - - `Implementation `_ `Forked Model `_ + - `Implementation `__ `Forked Model `__ +---------------------------------------- Summary of SmartSim Application Examples -######################################## +---------------------------------------- * **DeepDriveMD:** Based on the original DeepDriveMD work, extended to -orchestrate complex workflows with coupled applications without using the -filesystem for exchanging information. 
+ orchestrate complex workflows with coupled applications without using the + filesystem for exchanging information. * **TensorFlowFoam:** Uses TensorFlow inside of OpenFOAM simulations using -SmartSim. Displays SmartSim's capability to evaluate a machine learning model -from within a simulation with minimal external library code and minimal API -calls. + SmartSim. Displays SmartSim's capability to evaluate a machine learning model + from within a simulation with minimal external library code and minimal API + calls. * **ML-EKE:** Runs an ensemble of simulations all using the SmartSim -architecture to replace a parameterization (MEKE) within each global ocean -simulation (MOM6). + architecture to replace a parameterization (MEKE) within each global ocean + simulation (MOM6). * **LAMMPS + SmartSim:** Implementation of a ``SMARTSIM`` dump style which uses -the SmartRedis clients to stream data to an Orchestrator database created by -SmartSim. + the SmartRedis clients to stream data to an Orchestrator database created by + SmartSim. diff --git a/doc/sr_advanced_topics.rst b/doc/sr_advanced_topics.rst new file mode 100644 index 000000000..30da2c578 --- /dev/null +++ b/doc/sr_advanced_topics.rst @@ -0,0 +1,2 @@ + +.. include:: ../smartredis/doc/advanced_topics.rst \ No newline at end of file diff --git a/doc/sr_cpp_walkthrough.rst b/doc/sr_cpp_walkthrough.rst index d5282bd6d..db27189e0 100644 --- a/doc/sr_cpp_walkthrough.rst +++ b/doc/sr_cpp_walkthrough.rst @@ -3,9 +3,9 @@ C++ *** -This section presents examples that use the SmartRedis C++ API to -interact with the RedisAI tensor, model, and script data types. -Additionally, this section demonstrates an example of utilizing +This section presents examples that use the SmartRedis C++ API to +interact with the RedisAI tensor, model, and script data types. +Additionally, this section demonstrates an example of utilizing the SmartRedis ``DataSet`` API. @@ -16,7 +16,7 @@ the SmartRedis ``DataSet`` API. .. 
note:: - The C++ API examples are written to connect to a clustered database + The C++ API examples are written to connect to a clustered database or clustered SmartSim Orchestrator. Update the ``Client`` constructor ``cluster`` flag to `false` to connect to a single shard (single compute host) database. @@ -39,7 +39,7 @@ DataSets The C++ client can store and retrieve tensors and metadata in datasets. For further information about datasets, please refer to the :ref:`Dataset -section of the Data Structures documentation page `. +section of the Data Structures documentation page `. The code below shows how to store and retrieve tensors and metadata that belong to a ``DataSet``. @@ -69,8 +69,8 @@ executes a preprocessing script. Scripts ======= -The example in :ref:`SR CPP Models` shows how to store and use -a PyTorch script in the database with the C++ Client. +The example in :ref:`SR CPP Models` shows how to store and use +a PyTorch script in the database with the C++ Client. The script is stored as a file in the ``../../../common/mnist_data/`` path relative to the compiled executable. Note that this example also sets and @@ -104,4 +104,3 @@ source code is also shown. :linenos: :language: Python :lines: 15-20 - diff --git a/doc/sr_fortran_walkthrough.rst b/doc/sr_fortran_walkthrough.rst index d2cf78576..f01545db8 100644 --- a/doc/sr_fortran_walkthrough.rst +++ b/doc/sr_fortran_walkthrough.rst @@ -1,9 +1,10 @@ +.. _fortran_client_examples: + ******* Fortran ******* -.. 
_fortran_client_examples: In this section, examples are presented using the SmartRedis Fortran diff --git a/doc/sr_python_walkthrough.rst b/doc/sr_python_walkthrough.rst index 3c7a66106..00c5bd0f2 100644 --- a/doc/sr_python_walkthrough.rst +++ b/doc/sr_python_walkthrough.rst @@ -16,7 +16,7 @@ This section details the SmartRedis Python client to demonstrate its general use Tensors ======= -The Python client can send and receive tensors from the Redis database, +The Python client can send and receive tensors from the Redis database, where they are stored as RedisAI data structures. Additionally, Python client API functions involving tensor data are compatible with Numpy arrays and do not require other data types. @@ -32,14 +32,14 @@ Datasets The Python client can store and retrieve tensors and metadata in datasets. For further information about datasets, please refer to the :ref:`Dataset -section of the Data Structures documentation page `. +section of the Data Structures documentation page `. The code below shows how to store and retrieve tensors that belong to a ``DataSet``. .. literalinclude:: ../smartredis/examples/serial/python/example_put_get_dataset.py :language: python :linenos: - :lines: 26-52 + :lines: 27-51 Models ====== @@ -52,14 +52,14 @@ jit-traced PyTorch model can be used with the Python client library. .. literalinclude:: ../smartredis/examples/serial/python/example_model_torch.py :language: python :linenos: - :lines: 26-71 + :lines: 27-70 Users can set models from a file, as shown in the code below. .. literalinclude:: ../smartredis/examples/serial/python/example_model_file_torch.py :language: python :linenos: - :lines: 26-69 + :lines: 27-68 Scripts ======= @@ -67,7 +67,7 @@ Scripts Scripts are a way to store python-executable code in the database. The Python client can send scripts to the dataset from a file or directly from memory. -The code below illustrates how to avoid storing a function in an intermediate file. 
+The code below illustrates how to avoid storing a function in an intermediate file. With this technique, we can define and send a function to the database on the fly. .. literalinclude:: ../smartredis/examples/serial/python/example_script.py @@ -75,7 +75,7 @@ With this technique, we can define and send a function to the database on the fl :linenos: :lines: 26-66 -The code below shows how to set a script from a file. Running the script set from +The code below shows how to set a script from a file. Running the script set from the file uses the same API calls as in the example shown above. .. literalinclude:: ../smartredis/examples/serial/python/example_script_file.py @@ -89,4 +89,3 @@ looks like this: .. literalinclude:: ../smartredis/examples/serial/python/data_processing_script.txt :language: python :linenos: - diff --git a/doc/testing.rst b/doc/testing.rst index c04b613f8..bdaa473d7 100644 --- a/doc/testing.rst +++ b/doc/testing.rst @@ -66,23 +66,23 @@ of the tests located within the ``on_wlm`` directory. To run the ``on_wlm`` test suite, users will have to be on a system with one of the supported workload managers. Additionally, users will -need to obtain an allocation of **at least 3 nodes**. +need to obtain an allocation of **at least 4 nodes**. Examples of how to obtain allocations on systems with the launchers: .. code:: bash # for slurm (with srun) - salloc -N 3 -A account --exclusive -t 00:10:00 + salloc -N 4 -A account --exclusive -t 00:10:00 # for PBSPro (with aprun) - qsub -l select=3 -l place=scatter -l walltime=00:10:00 -q queue + qsub -l select=4 -l place=scatter -l walltime=00:10:00 -q queue # for Cobalt (with aprun) - qsub -n 3 -t 00:10:00 -A account -q queue -I + qsub -n 4 -t 00:10:00 -A account -q queue -I # for LSF (with jsrun) - bsub -Is -W 00:30 -nnodes 3 -P project $SHELL + bsub -Is -W 00:30 -nnodes 4 -P project $SHELL Values for queue, account, or project should be substituted appropriately. 
@@ -123,7 +123,7 @@ A full example on an internal SLURM system .. code:: bash - salloc -N 3 -A account --exclusive -t 03:00:00 + salloc -N 4 -A account --exclusive -t 03:00:00 export SMARTSIM_TEST_LAUNCHER=slurm export SMARTSIM_TEST_INTERFACE=ipogif0 export SMARTSIM_TEST_DEVICE=gpu diff --git a/doc/tutorials/training.rst b/doc/tutorials/training.rst deleted file mode 100644 index 495ebbc09..000000000 --- a/doc/tutorials/training.rst +++ /dev/null @@ -1,153 +0,0 @@ - -=============== -Online Training -=============== - -Online training provides the ability to use dynamic processes as your training -data set. In SmartSim, training data can be any process using the SmartRedis clients -to store data inside of a deployed `Orchestrator` database. - -SmartSim includes utilities to help with online training workflows in PyTorch and TensorFlow -In this example, we show how to use ``smartsim.ml.tf`` to train a Neural Network implemented -in TensorFlow and Keras. - -In particular, we will be using two classes: -- ``smartsim.ml.data.TrainingUploader`` which streamlines the uploading of samples and corresponding targets to the DB -- ``smartsim.ml.tf.DataGenerator`` which is a Keras ``Generator`` which can be used to train a DNN, -and will download the samples from the DB updating the training set at the end of each epoch. - -The SmartSim ``Experiment`` will consist in one mock simulation (the ``producer``) uploading samples, -and one application (the ``training_service``) downloading the samples to train a DNN. - -A richer example, entirely implemented in Python, is available as a Jupyter Notebook in the -``tutorials`` section of the SmartSim repository. An equivalent example using PyTorch -instead of TensorFlow will soon be available in the same directory. - - -Producing and uploading the samples ------------------------------------ - -.. 
_ml_training_producer_code: - -The first application in the workflow, the ``producer`` will upload batches of samples at regular intervals, -mimicking the behavior of an iterative simulation. - -Since the ``training_service`` will use a ``smartsim.ml.tf.DynamicDataGenerator`` two download the samples, their -keys need to follow a pre-defined format. Assuming that only one process in the simulation -uploads the data, this format is ``_``. And for targets -(which can also be integer labels), the key format is ``_``. Both ```` -and ```` are user-defined, and will need to be used to initialize the -``smartsim.ml.tf.DynamicDataGenerator`` object. - -Assuming the simulation is written in Python, then the code would look like - -.. code-block:: python - - from SmartRedis import Client - # simulation initialization code - client = Client(cluster=False, address=None) - - for iteration in range(num_iterations): - # simulation code producing two tensors, data_points - # and data_values - client.put_tensor(f"points_{iteration}", data_points) - client.put_tensor(f"values_{iteration}", data_values) - - -For simple simulations, this is sufficient. But if the simulation -uses MPI, then each rank could upload a portion of the data set. In that case, -the format for sample and target keys will be ``__`` -and ``__``, where ```` can be, e.g. -the MPI rank id. - - -Downloading the samples and training the model ----------------------------------------------- - -The second part of the workflow is the ``training_service``, an application that -downloads the data uploaded by the ``producer`` and uses them to train a ML model. -Most importantly, the ``training_service`` needs to keep looking for new samples, -and download them as they are available. The training data set size thus needs to grow at -each ``producer`` iteration. - -In Keras, a ``Sequence`` represents a data set and can be passed to ``model.fit()``. 
-The class ``smartsim.ml.tf.DynamicDataGenerator`` is a Keras ``Sequence``, which updates -its data set at the end of each training epoch, looking for newly produced batches of samples. -A current limitation of the TensorFlow training algorithm is that it does not take -into account changes of size in the data sets once the training has started, i.e. it is always -assumed that the training (and validation) data does not change during the training. To -overcome this limitation, we need to train one epoch at the time. Thus, -following what we defined in the :ref:`producer section `, -the ``training_service`` would look like - -.. code-block:: python - - from smartsim.ml.tf import DynamicDataGenerator - generator = DynamicDataGenerator( - sample_prefix="points", - target_prefix="value", - batch_size=32, - cluster=False) - - model = # some ML model - # model initialization - - for epoch in range(100): - model.fit(generator, - steps_per_epoch=None, - epochs=epoch+1, - initial_epoch=epoch, - batch_size=generator.batch_size, - verbose=2) - - -Again, this is enough for simple simulations. If the simulation uses MPI, -then the ``DynamicDataGenerator`` needs to know about the possible sub-indices. For example, -if the simulation runs 8 MPI ranks, the ``DynamicDataGenerator`` initialization will -need to be adapted as follows - -.. code-block:: python - - generator = DynamicDataGenerator( - sample_prefix="points", - target_prefix="value", - batch_size=32, - cluster=False, - uploader_ranks=8) - - -Launching the experiment ------------------------- - -To launch the ``producer`` and the ``training_service`` as models -within a SmartSim ``Experiment``, we can use the following code: - -.. 
code-block:: python - - from smartsim import Experiment - from smartsim.database import Orchestrator - - db = Orchestrator(port=6780) - exp = Experiment("online-training", launcher="local") - - # producer - producer_script = "producer.py" - settings = exp.create_run_settings("python", exe_args=producer_script) - uploader_model = exp.create_model("producer", settings, enable_key_prefixing=True) - uploader_model.attach_generator_files(to_copy=producer_script) - - # training_service - training_script = "training_service.py" - settings = exp.create_run_settings("python", exe_args=training_script) - trainer_model = exp.create_model("training_service", settings) - trainer_model.register_incoming_entity(uploader_model) - - exp.start(db, uploader_model, block=False, summary=False) - exp.start(trainer_model, block=True, summary=False) - - -Two lines require attention, as they are needed by the ``DataGenerator`` to work: - - ``uploader_model.enable_key_prefixing()`` will ensure that the ``producer`` prefixes all tensor keys with its name - - ``trainer_model.register_incoming_entity(uploader_model)`` enables the ``DataGenerator`` in - the ``training_service`` to know that it needs to download samples produced by the ``producer`` - diff --git a/docker/dev/Dockerfile b/docker/dev/Dockerfile index 8d4440ce7..6a5f82642 100644 --- a/docker/dev/Dockerfile +++ b/docker/dev/Dockerfile @@ -27,14 +27,15 @@ FROM ubuntu:20.04 LABEL maintainer="Cray Labs" + + ARG DEBIAN_FRONTEND="noninteractive" ENV TZ=US/Seattle -RUN useradd --system --create-home --shell /bin/bash -g root -G sudo craylabs - -RUN apt-get update \ +RUN useradd --system --create-home --shell /bin/bash -g root -G sudo craylabs && \ + apt-get update \ && apt-get install --no-install-recommends -y build-essential \ - git gcc make git-lfs wget libopenmpi-dev openmpi-bin \ + git gcc make git-lfs wget libopenmpi-dev openmpi-bin unzip \ python3-pip python3 python3-dev cmake \ && rm -rf /var/lib/apt/lists/* \ && ln -s 
/usr/bin/python3 /usr/bin/python @@ -45,16 +46,15 @@ RUN git clone https://github.com/CrayLabs/SmartRedis.git --branch develop --dept && cd smartredis \ && python -m pip install . - COPY . /home/craylabs/SmartSim RUN chown craylabs:root -R SmartSim USER craylabs -RUN cd SmartSim && SMARTSIM_SUFFIX=dev python -m pip install . +RUN cd SmartSim && SMARTSIM_SUFFIX=dev python -m pip install .[ml] -RUN python -m pip install smartsim[ml,dev]==0.5.1 jupyter jupyterlab matplotlib && \ +RUN export PATH=/home/craylabs/.local/bin:$PATH && \ echo "export PATH=/home/craylabs/.local/bin:$PATH" >> /home/craylabs/.bashrc && \ - export PATH=/home/craylabs/.local/bin:$PATH && \ + python -m pip install jupyter jupyterlab matplotlib && \ smart clobber && \ smart build --device cpu -v && \ chown craylabs:root -R /home/craylabs/.local && \ diff --git a/docker/docs/dev/Dockerfile b/docker/docs/dev/Dockerfile index a27ae03c1..eee809910 100644 --- a/docker/docs/dev/Dockerfile +++ b/docker/docs/dev/Dockerfile @@ -42,17 +42,25 @@ RUN ln -s /usr/bin/python3 /usr/bin/python COPY . /usr/local/src/SmartSim/ WORKDIR /usr/local/src/SmartSim/ -# Install docs dependencies and SmartSim -RUN python -m pip install -r doc/requirements-doc.txt && \ - NO_CHECKS=1 SMARTSIM_SUFFIX=dev python -m pip install . - # Install smartredis RUN git clone https://github.com/CrayLabs/SmartRedis.git --branch develop --depth=1 smartredis \ && cd smartredis \ && python -m pip install . \ && rm -rf ~/.cache/pip -RUN cd doc/tutorials/ && \ - ln -s ../../tutorials/* . +# Install smartdashboard +RUN git clone https://github.com/CrayLabs/SmartDashboard.git --branch develop --depth=1 smartdashboard \ + && cd smartdashboard \ + && python -m pip install . \ + && rm -rf ~/.cache/pip + +# Install docs dependencies and SmartSim +RUN python -m pip install -r doc/requirements-doc.txt \ + && NO_CHECKS=1 SMARTSIM_SUFFIX=dev python -m pip install . 
+ +RUN mkdir -p doc/tutorials/ \ + && cd doc/tutorials/ \ + && rm -rf * \ + && ln -s ../../tutorials/* . RUN make docs diff --git a/docker/prod/Dockerfile b/docker/prod/Dockerfile index adb8593a4..628d9af60 100644 --- a/docker/prod/Dockerfile +++ b/docker/prod/Dockerfile @@ -35,7 +35,7 @@ ENV TZ=US/Seattle RUN useradd --system --create-home --shell /bin/bash -g root -G sudo craylabs && \ apt-get update \ && apt-get install --no-install-recommends -y build-essential \ - git gcc make git-lfs wget libopenmpi-dev openmpi-bin \ + git gcc make git-lfs wget libopenmpi-dev openmpi-bin unzip \ python3-pip python3 python3-dev cmake \ && rm -rf /var/lib/apt/lists/* \ && ln -s /usr/bin/python3 /usr/bin/python @@ -44,12 +44,11 @@ WORKDIR /home/craylabs COPY --chown=craylabs:root ./tutorials/ /home/craylabs/tutorials/ USER craylabs -RUN python -m pip install smartsim[ml]==0.5.1 jupyter jupyterlab matplotlib && \ +RUN export PATH=/home/craylabs/.local/bin:$PATH && \ echo "export PATH=/home/craylabs/.local/bin:$PATH" >> /home/craylabs/.bashrc && \ - export PATH=/home/craylabs/.local/bin:$PATH && \ + python -m pip install smartsim[ml]==0.6.0 jupyter jupyterlab matplotlib && \ smart build --device cpu -v && \ chown craylabs:root -R /home/craylabs/.local && \ rm -rf ~/.cache/pip -# remove non-jupyter notebook tutorials CMD ["/bin/bash", "-c", "PATH=/home/craylabs/.local/bin:$PATH /home/craylabs/.local/bin/jupyter lab --port 8888 --no-browser --ip=0.0.0.0"] diff --git a/pyproject.toml b/pyproject.toml index 6d952892b..60c33bee5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ build-backend = "setuptools.build_meta" [tool.black] line-length = 88 -target-version = ['py37', 'py38'] +target-version = ['py38', 'py39', 'py310'] exclude = ''' ( | \.egg @@ -51,6 +51,11 @@ exclude = ''' [tool.pytest.ini_options] log_cli = true log_cli_level = "debug" +markers = [ + "group_a: fast test subset a", + "group_b: fast test subset b", + "slow_tests: tests that take a long duration 
to complete", +] [tool.isort] # supress circular import warning @@ -102,6 +107,7 @@ module = [ "keras", "torch", "smartsim.ml.torch.*", # must solve/ignore inheritance issues + "watchdog", ] ignore_missing_imports = true ignore_errors = true diff --git a/setup.cfg b/setup.cfg index 297531e03..49419c7eb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -57,11 +57,15 @@ setup_requires = include_package_data = True python_requires = >=3.8,<3.11 - - [options.packages.find] +include = + smartsim* exclude = .third-party tests doc smartredis + +[options.package_data] +smartsim._core.bin = + * diff --git a/setup.py b/setup.py index 74e85b461..66a534456 100644 --- a/setup.py +++ b/setup.py @@ -167,6 +167,7 @@ def has_ext_modules(_placeholder): "tqdm>=4.50.2", "filelock>=3.4.2", "protobuf~=3.20", + "watchdog>=3.0.0", ] # Add SmartRedis at specific version @@ -174,9 +175,9 @@ def has_ext_modules(_placeholder): extras_require = { "dev": [ - "black>=20.8b1", + "black==24.1a1", "isort>=5.6.4", - "pylint>=2.10.0", + "pylint>=2.10.0,<3", "pytest>=6.0.0", "pytest-cov>=2.10.1", "click==8.0.2", @@ -187,7 +188,7 @@ def has_ext_modules(_placeholder): "types-redis", "types-tabulate", "types-tqdm", - "types-tensorflow", + "types-tensorflow==2.12.0.9", "types-setuptools", ], # see smartsim/_core/_install/buildenv.py for more details @@ -199,10 +200,6 @@ def has_ext_modules(_placeholder): setup( version=smartsim_version, install_requires=deps, - packages=["smartsim"], - package_data={"smartsim": [ - "_core/bin/*", - ]}, cmdclass={ "build_py": SmartSimBuild, "install": InstallPlatlib, diff --git a/smartsim/_core/_cli/__main__.py b/smartsim/_core/_cli/__main__.py index 68d22d14f..47df07048 100644 --- a/smartsim/_core/_cli/__main__.py +++ b/smartsim/_core/_cli/__main__.py @@ -24,14 +24,30 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import os import sys from smartsim._core._cli.cli import default_cli +from smartsim._core._cli.utils import SMART_LOGGER_FORMAT +from smartsim.error.errors import SmartSimCLIActionCancelled +from smartsim.log import get_logger + +logger = get_logger("Smart", fmt=SMART_LOGGER_FORMAT) def main() -> int: smart_cli = default_cli() - return smart_cli.execute(sys.argv) + exception_trace_back_msg = "SmartSim exited with the following exception info:" + + try: + return smart_cli.execute(sys.argv) + except SmartSimCLIActionCancelled as ssi: + logger.info(str(ssi)) + logger.debug(exception_trace_back_msg, exc_info=ssi) + except KeyboardInterrupt as e: + logger.info("SmartSim was terminated by user") + logger.debug(exception_trace_back_msg, exc_info=e) + return os.EX_OK if __name__ == "__main__": diff --git a/smartsim/_core/_cli/build.py b/smartsim/_core/_cli/build.py index 4218d79db..474d96c8a 100644 --- a/smartsim/_core/_cli/build.py +++ b/smartsim/_core/_cli/build.py @@ -65,11 +65,8 @@ def check_py_onnx_version(versions: Versioner) -> None: msg = ( "An onnx wheel is not available for " f"Python {py_version.major}.{py_version.minor}. " - "Instead consider using Python 3.8 or 3.9 with Onnx " + "Instead consider using Python 3.8 or 3.9 for ONNX 1.11 support" ) - if sys.platform == "linux": - msg += "1.2.5 or " - msg += "1.2.7." 
raise SetupError(msg) _check_packages_in_python_env( { @@ -343,8 +340,8 @@ def _format_incompatible_python_env_message( missing: t.Iterable[str], conflicting: t.Iterable[str] ) -> str: indent = "\n\t" - fmt_list: t.Callable[[str, t.Iterable[str]], str] = ( - lambda n, l: f"{n}:{indent}{indent.join(l)}" if l else "" + fmt_list: t.Callable[[str, t.Iterable[str]], str] = lambda n, l: ( + f"{n}:{indent}{indent.join(l)}" if l else "" ) missing_str = fmt_list("Missing", missing) conflict_str = fmt_list("Conflicting", conflicting) @@ -359,7 +356,9 @@ def _format_incompatible_python_env_message( ) -def execute(args: argparse.Namespace) -> int: +def execute( + args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / +) -> int: verbose = args.v keydb = args.keydb device: _TDeviceStr = args.device @@ -419,7 +418,7 @@ def execute(args: argparse.Namespace) -> int: ) except (SetupError, BuildError) as e: logger.error(str(e)) - return 1 + return os.EX_SOFTWARE backends = installed_redisai_backends() backends_str = ", ".join(s.capitalize() for s in backends) if backends else "No" @@ -434,10 +433,10 @@ def execute(args: argparse.Namespace) -> int: check_py_onnx_version(versions) except (SetupError, BuildError) as e: logger.error(str(e)) - return 1 + return os.EX_SOFTWARE logger.info("SmartSim build complete!") - return 0 + return os.EX_OK def configure_parser(parser: argparse.ArgumentParser) -> None: diff --git a/smartsim/_core/_cli/clean.py b/smartsim/_core/_cli/clean.py index fcf051f0c..d8a85f8a9 100644 --- a/smartsim/_core/_cli/clean.py +++ b/smartsim/_core/_cli/clean.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import argparse +import typing as t from smartsim._core._cli.utils import clean, get_install_path @@ -39,10 +40,14 @@ def configure_parser(parser: argparse.ArgumentParser) -> None: ) -def execute(args: argparse.Namespace) -> int: +def execute( + args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / +) -> int: return clean(get_install_path() / "_core", _all=args.clobber) -def execute_all(args: argparse.Namespace) -> int: +def execute_all( + args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / +) -> int: args.clobber = True return execute(args) diff --git a/smartsim/_core/_cli/cli.py b/smartsim/_core/_cli/cli.py index ce2376c15..ef4c113e1 100644 --- a/smartsim/_core/_cli/cli.py +++ b/smartsim/_core/_cli/cli.py @@ -27,6 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import argparse +import os import typing as t from smartsim._core._cli.build import configure_parser as build_parser @@ -36,51 +37,69 @@ from smartsim._core._cli.clean import execute_all as clobber_execute from smartsim._core._cli.dbcli import execute as dbcli_execute from smartsim._core._cli.info import execute as info_execute +from smartsim._core._cli.plugin import plugins from smartsim._core._cli.site import execute as site_execute -from smartsim._core._cli.validate import ( - execute as validate_execute, - configure_parser as validate_parser, -) from smartsim._core._cli.utils import MenuItemConfig +from smartsim._core._cli.validate import configure_parser as validate_parser +from smartsim._core._cli.validate import execute as validate_execute class SmartCli: def __init__(self, menu: t.List[MenuItemConfig]) -> None: - self.menu: t.Dict[str, MenuItemConfig] = {item.command: item for item in menu} - parser = argparse.ArgumentParser( + self.menu: t.Dict[str, MenuItemConfig] = {} + self.parser = argparse.ArgumentParser( prog="smart", description="SmartSim command line interface", ) - self.parser = parser - self.args: 
t.Optional[argparse.Namespace] = None - subparsers = parser.add_subparsers( + self.subparsers = self.parser.add_subparsers( dest="command", required=True, metavar="", help="Available commands", ) - for cmd, item in self.menu.items(): - parser = subparsers.add_parser( - cmd, description=item.description, help=item.description - ) - if item.configurator: - item.configurator(parser) + self.register_menu_items(menu) + self.register_menu_items([plugin() for plugin in plugins]) def execute(self, cli_args: t.List[str]) -> int: if len(cli_args) < 2: self.parser.print_help() - return 0 + return os.EX_USAGE - app_args = cli_args[1:] - self.args = self.parser.parse_args(app_args) + app_args = cli_args[1:] # exclude the path to executable + subcommand = cli_args[1] # first positional arg is the subcommand - if not (menu_item := self.menu.get(app_args[0], None)): + menu_item = self.menu.get(subcommand, None) + if not menu_item: self.parser.print_help() - return 0 + return os.EX_USAGE - return menu_item.handler(self.args) + args = argparse.Namespace() + unparsed_args = [] + + if menu_item.is_plugin: + unparsed_args = app_args[1:] + else: + args = self.parser.parse_args(app_args) + + return menu_item.handler(args, unparsed_args) + + def _register_menu_item(self, item: MenuItemConfig) -> None: + parser = self.subparsers.add_parser( + item.command, description=item.description, help=item.description + ) + if item.configurator: + item.configurator(parser) + + if item.command in self.menu: + raise ValueError(f"{item.command} cannot overwrite existing CLI command") + + self.menu[item.command] = item + + def register_menu_items(self, menu_items: t.List[MenuItemConfig]) -> None: + for item in menu_items: + self._register_menu_item(item) def default_cli() -> SmartCli: diff --git a/smartsim/_core/_cli/dbcli.py b/smartsim/_core/_cli/dbcli.py index 22a376588..ce0975bc4 100644 --- a/smartsim/_core/_cli/dbcli.py +++ b/smartsim/_core/_cli/dbcli.py @@ -25,13 +25,17 @@ # OF THIS SOFTWARE, EVEN 
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import argparse +import os +import typing as t from smartsim._core._cli.utils import get_db_path -def execute(_args: argparse.Namespace) -> int: +def execute( + _args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / +) -> int: if db_path := get_db_path(): print(db_path) - return 0 + return os.EX_OK print("Database (Redis or KeyDB) dependencies not found") - return 1 + return os.EX_SOFTWARE diff --git a/smartsim/_core/_cli/info.py b/smartsim/_core/_cli/info.py index 35ee9b9ec..c08fcb1a3 100644 --- a/smartsim/_core/_cli/info.py +++ b/smartsim/_core/_cli/info.py @@ -1,5 +1,6 @@ import argparse import importlib.metadata +import os import pathlib import typing as t @@ -12,7 +13,9 @@ _MISSING_DEP = _helpers.colorize("Not Installed", "red") -def execute(_args: argparse.Namespace, /) -> int: +def execute( + _args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / +) -> int: print("\nSmart Python Packages:") print( tabulate( @@ -66,7 +69,7 @@ def execute(_args: argparse.Namespace, /) -> int: ), end="\n\n", ) - return 0 + return os.EX_OK def _fmt_installed_db(db_path: t.Optional[pathlib.Path]) -> str: diff --git a/smartsim/_core/_cli/plugin.py b/smartsim/_core/_cli/plugin.py new file mode 100644 index 000000000..32c69b7e9 --- /dev/null +++ b/smartsim/_core/_cli/plugin.py @@ -0,0 +1,55 @@ +import argparse +import importlib.util +import os +import subprocess as sp +import sys +import typing as t + +import smartsim.log +from smartsim._core._cli.utils import SMART_LOGGER_FORMAT, MenuItemConfig +from smartsim.error.errors import SmartSimCLIActionCancelled + +_LOGGER = smartsim.log.get_logger("Smart", fmt=SMART_LOGGER_FORMAT) + + +def dynamic_execute( + cmd: str, plugin_name: str +) -> t.Callable[[argparse.Namespace, t.List[str]], int]: + def process_execute( + _args: argparse.Namespace, unparsed_args: t.List[str], / + ) -> int: + try: + spec = importlib.util.find_spec(cmd) + if spec is None: + 
raise AttributeError + except (ModuleNotFoundError, AttributeError): + _LOGGER.error(f"{cmd} plugin not found. Please ensure it is installed") + return os.EX_CONFIG + + combined_cmd = [sys.executable, "-m", cmd] + unparsed_args + + try: + completed_proc = sp.run(combined_cmd, check=False) + except KeyboardInterrupt as ex: + msg = f"{plugin_name} terminated by user" + raise SmartSimCLIActionCancelled(msg) from ex + return completed_proc.returncode + + return process_execute + + +def dashboard() -> MenuItemConfig: + return MenuItemConfig( + "dashboard", + ( + "Start the SmartSim dashboard to monitor experiment output from a " + "graphical user interface. This requires that the SmartSim Dashboard " + "Package be installed. For more infromation please visit " + "https://github.com/CrayLabs/SmartDashboard" + ), + dynamic_execute("smartdashboard", "Dashboard"), + is_plugin=True, + ) + + +plugins = (dashboard,) diff --git a/smartsim/_core/_cli/site.py b/smartsim/_core/_cli/site.py index 5fe667cde..c86e0341b 100644 --- a/smartsim/_core/_cli/site.py +++ b/smartsim/_core/_cli/site.py @@ -25,10 +25,12 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import argparse +import os +import typing as t from smartsim._core._cli.utils import get_install_path -def execute(_args: argparse.Namespace) -> int: +def execute(_args: argparse.Namespace, _unparsed_args: t.List[str], /) -> int: print(get_install_path()) - return 0 + return os.EX_OK diff --git a/smartsim/_core/_cli/utils.py b/smartsim/_core/_cli/utils.py index d8fd4b68c..e31d0aed2 100644 --- a/smartsim/_core/_cli/utils.py +++ b/smartsim/_core/_cli/utils.py @@ -24,7 +24,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import importlib +import importlib.util +import os import shutil import subprocess as sp import sys @@ -110,7 +111,7 @@ def clean(core_path: Path, _all: bool = False) -> int: if removed: logger.info("Successfully removed SmartSim database installation") - return 0 + return os.EX_OK def get_db_path() -> t.Optional[Path]: @@ -121,7 +122,7 @@ def get_db_path() -> t.Optional[Path]: return None -_CliHandler = t.Callable[[Namespace], int] +_CliHandler = t.Callable[[Namespace, t.List[str]], int] _CliParseConfigurator = t.Callable[[ArgumentParser], None] @@ -132,8 +133,10 @@ def __init__( description: str, handler: _CliHandler, configurator: t.Optional[_CliParseConfigurator] = None, + is_plugin: bool = False, ): self.command = cmd self.description = description self.handler = handler self.configurator = configurator + self.is_plugin = is_plugin diff --git a/smartsim/_core/_cli/validate.py b/smartsim/_core/_cli/validate.py index df1d331e0..bda254859 100644 --- a/smartsim/_core/_cli/validate.py +++ b/smartsim/_core/_cli/validate.py @@ -82,7 +82,9 @@ def __exit__( self._finalizer.detach() # type: ignore[attr-defined] -def execute(args: argparse.Namespace, /) -> int: +def execute( + args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / +) -> int: """Validate the SmartSim installation works as expected given a simple experiment """ @@ -101,10 +103,11 @@ def execute(args: argparse.Namespace, /) -> int: logger.error( "SmartSim failed to run a simple experiment!\n" f"Experiment failed due to the following exception:\n{e}\n\n" - f"Output files are available at `{temp_dir}`" + f"Output files are available at `{temp_dir}`", + exc_info=True, ) - return 2 - return 0 + return os.EX_SOFTWARE + return os.EX_OK def configure_parser(parser: argparse.ArgumentParser) -> None: @@ -138,6 +141,7 @@ def test_install( with_onnx: bool, ) -> None: exp = Experiment("ValidationExperiment", exp_path=location, launcher="local") + exp.disable_telemetry() port = _find_free_port() if 
port is None else port with _make_managed_local_orc(exp, port) as client: logger.info("Verifying Tensor Transfer") @@ -164,7 +168,7 @@ def _make_managed_local_orc( exp.start(orc) try: (client_addr,) = orc.get_address() - yield Client(address=client_addr, cluster=False) + yield Client(False, address=client_addr) finally: exp.stop(orc) diff --git a/smartsim/_core/_install/buildenv.py b/smartsim/_core/_install/buildenv.py index c6a050b4d..eaa2c68bd 100644 --- a/smartsim/_core/_install/buildenv.py +++ b/smartsim/_core/_install/buildenv.py @@ -46,9 +46,12 @@ # to remove # https://setuptools.pypa.io/en/latest/pkg_resources.html +# isort: off import pkg_resources from pkg_resources import packaging # type: ignore +# isort: on + Version = packaging.version.Version InvalidVersion = packaging.version.InvalidVersion DbEngine = t.Literal["REDIS", "KEYDB"] @@ -189,17 +192,6 @@ class RedisAIVersion(Version_): """ defaults = { - "1.2.5": { - "tensorflow": "2.6.2", - "onnx": "1.9.0", - "skl2onnx": "1.10.3", - "onnxmltools": "1.10.0", - "scikit-learn": "1.0.2", - "torch": "1.9.1", - "torch_cpu_suffix": "+cpu", - "torch_cuda_suffix": "+cu111", - "torchvision": "0.10.1", - }, "1.2.7": { "tensorflow": "2.8.0", "onnx": "1.11.0", @@ -214,14 +206,10 @@ class RedisAIVersion(Version_): } # Remove options with unsported wheels for python>=3.10 if sys.version_info >= (3, 10): - defaults.pop("1.2.5") defaults["1.2.7"].pop("onnx") defaults["1.2.7"].pop("skl2onnx") defaults["1.2.7"].pop("onnxmltools") defaults["1.2.7"].pop("scikit-learn") - # Remove incompatible RAI versions for OSX - if sys.platform == "darwin": - defaults.pop("1.2.5", None) def __init__(self, vers: str) -> None: # pylint: disable=super-init-not-called min_rai_version = min(Version_(ver) for ver in self.defaults) @@ -288,8 +276,8 @@ class Versioner: PYTHON_MIN = Version_("3.8.0") # Versions - SMARTSIM = Version_(get_env("SMARTSIM_VERSION", "0.5.1")) - SMARTREDIS = Version_(get_env("SMARTREDIS_VERSION", "0.4.2")) + SMARTSIM = 
Version_(get_env("SMARTSIM_VERSION", "0.6.0")) + SMARTREDIS = Version_(get_env("SMARTREDIS_VERSION", "0.5.0")) SMARTSIM_SUFFIX = get_env("SMARTSIM_SUFFIX", "") # Redis @@ -362,9 +350,7 @@ def ml_extras_required(self) -> t.Dict[str, t.List[str]]: for field in _torch_fields: ml_defaults.pop(field) - return { - "ml": [f"{lib}=={vers}" for lib, vers in ml_defaults.items()] - } + return {"ml": [f"{lib}=={vers}" for lib, vers in ml_defaults.items()]} @staticmethod def get_sha(setup_py_dir: Path) -> str: diff --git a/smartsim/_core/_install/builder.py b/smartsim/_core/_install/builder.py index 1e9125626..f96a9bb5f 100644 --- a/smartsim/_core/_install/builder.py +++ b/smartsim/_core/_install/builder.py @@ -351,21 +351,6 @@ def build_onnx(self) -> bool: def fetch_onnx(self) -> bool: return self.build_onnx - def copy_tf_cmake(self) -> None: - """Copy the FindTensorFlow.cmake file to the build directory - as the version included in RedisAI is out of date for us. - Note: opt/cmake/modules removed in RedisAI v1.2.5 - """ - # remove the previous version - tf_cmake = self.rai_build_path / "opt/cmake/modules/FindTensorFlow.cmake" - tf_cmake.resolve() - if tf_cmake.is_file(): - tf_cmake.unlink() - # copy ours in - self.copy_file( - self.bin_path / "modules/FindTensorFlow.cmake", tf_cmake, set_exe=False - ) - def symlink_libtf(self, device: str) -> None: """Add symbolic link to available libtensorflow in RedisAI deps. 
@@ -453,7 +438,7 @@ def build_from_git(self, git_url: str, branch: str, device: str = "cpu") -> None # Circumvent a bad `get_deps.sh` script from RAI on 1.2.7 with ONNX # TODO: Look for a better way to do this or wait for RAI patch - if sys.platform == "darwin" and branch == "v1.2.7" and self.build_onnx: + if branch == "v1.2.7": # Clone RAI patch commit for OSX clone_cmd += ["RedisAI"] checkout_osx_fix = [ @@ -462,7 +447,7 @@ def build_from_git(self, git_url: str, branch: str, device: str = "cpu") -> None "634916c722e718cc6ea3fad46e63f7d798f9adc2", ] else: - # Clone RAI release commit + # Clone RAI release commit for versions > 1.2.7 clone_cmd += [ "--branch", branch, @@ -476,9 +461,6 @@ def build_from_git(self, git_url: str, branch: str, device: str = "cpu") -> None checkout_osx_fix, out=subprocess.DEVNULL, cwd=self.rai_build_path ) - # copy FindTensorFlow.cmake to RAI cmake dir - self.copy_tf_cmake() - # get RedisAI dependencies dep_cmd = self._rai_build_env_prefix( with_pt=self.build_torch, diff --git a/smartsim/_core/bin/modules/FindTensorFlow.cmake b/smartsim/_core/bin/modules/FindTensorFlow.cmake deleted file mode 100644 index b7c9f8cb5..000000000 --- a/smartsim/_core/bin/modules/FindTensorFlow.cmake +++ /dev/null @@ -1,359 +0,0 @@ -# Patrick Wieschollek, -# FindTensorFlow.cmake -# https://github.com/PatWie/tensorflow-cmake/blob/master/cmake/modules/FindTensorFlow.cmake -# ------------- -# -# Find TensorFlow library and includes -# -# Automatically set variables have prefix "TensorFlow_", -# while environmental variables you can specify have prefix "TENSORFLOW_" -# This module will set the following variables in your project: -# -# ``TensorFlow_VERSION`` -# exact TensorFlow version obtained from runtime -# ``TensorFlow_ABI`` -# ABI specification of TensorFlow library obtained from runtime -# ``TensorFlow_INCLUDE_DIR`` -# where to find tensorflow header files obtained from runtime -# ``TensorFlow_LIBRARY`` -# the libraries to link against to use TENSORFLOW 
obtained from runtime -# ``TensorFlow_FOUND TRUE`` -# If false, do not try to use TENSORFLOW. -# ``TensorFlow_C_LIBRARY`` -# Path to tensorflow_cc library (libtensorflow[.so,.dylib,.dll], or similar) -# -# for some examples, you will need to specify on of the following cmake variables: -# ``TensorFlow_BUILD_DIR`` Is the directory containing the tensorflow_cc library, which can be initialized -# with env-var 'TENSORFLOW_BUILD_DIR' environmental variable -# ``TensorFlow_SOURCE_DIR`` Is the path to source of TensorFlow, which can be initialized -# with env-var 'TENSORFLOW_SOURCE_DIR' environmental variable -# -# -# USAGE -# ------ -# add "list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}../../path/to/this/file)" to your project -# -# "add_tensorflow_gpu_operation" is a macro to compile a custom operation -# -# add_tensorflow_gpu_operation("") expects the following files to exists: -# - kernels/_kernel.cc -# - kernels/_kernel_gpu.cu.cc (kernels/_kernel.cu is supported as well) -# - kernels/_op.cc -# - kernels/_op.h -# - ops/.cc - -if(APPLE) - message(WARNING "This FindTensorflow.cmake is not tested on APPLE\n" - "Please report if this works\n" - "https://github.com/PatWie/tensorflow-cmake") -endif() - -if(WIN32) - message(WARNING "This FindTensorflow.cmake is not tested on WIN32\n" - "Please report if this works\n" - "https://github.com/PatWie/tensorflow-cmake") -endif() - -set(PYTHON_EXECUTABLE "python3" CACHE STRING "specify the python version TensorFlow is installed on.") - -if(TensorFlow_FOUND AND EXISTS "${TensorFlow_LIBRARY}" AND IS_DIRECTORY "${TensorFlow_INCLUDE_DIR}") - # reuse cached variables - message(STATUS "Reuse cached information from TensorFlow ${TensorFlow_VERSION} ") -else() - message(STATUS "Detecting TensorFlow using ${PYTHON_EXECUTABLE}" - " (use -DPYTHON_EXECUTABLE=... 
otherwise)") - execute_process( - COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow as tf; print(tf.__version__); print(tf.__cxx11_abi_flag__); print(tf.sysconfig.get_include()); print(tf.sysconfig.get_lib());" - OUTPUT_VARIABLE TF_INFORMATION_STRING - OUTPUT_STRIP_TRAILING_WHITESPACE - RESULT_VARIABLE retcode) - - if(NOT "${retcode}" STREQUAL "0") - message(FATAL_ERROR "Detecting TensorFlow info - failed \n Did you installed TensorFlow?") - else() - message(STATUS "Detecting TensorFlow info - done") - endif() - - string(REPLACE "\n" ";" TF_INFORMATION_LIST ${TF_INFORMATION_STRING}) - list(GET TF_INFORMATION_LIST 0 TF_DETECTED_VERSION) - list(GET TF_INFORMATION_LIST 1 TF_DETECTED_ABI) - list(GET TF_INFORMATION_LIST 2 TF_DETECTED_INCLUDE_DIR) - list(GET TF_INFORMATION_LIST 3 TF_DETECTED_LIBRARY_PATH) - - # set(TF_DETECTED_VERSION 1.8) - - set(_packageName "TF") - if (DEFINED TF_DETECTED_VERSION) - string (REGEX MATCHALL "[0-9]+" _versionComponents "${TF_DETECTED_VERSION}") - list (LENGTH _versionComponents _len) - if (${_len} GREATER 0) - list(GET _versionComponents 0 TF_DETECTED_VERSION_MAJOR) - endif() - if (${_len} GREATER 1) - list(GET _versionComponents 1 TF_DETECTED_VERSION_MINOR) - endif() - if (${_len} GREATER 2) - list(GET _versionComponents 2 TF_DETECTED_VERSION_PATCH) - endif() - if (${_len} GREATER 3) - list(GET _versionComponents 3 TF_DETECTED_VERSION_TWEAK) - endif() - set (TF_DETECTED_VERSION_COUNT ${_len}) - else() - set (TF_DETECTED_VERSION_COUNT 0) - endif() - - - # -- prevent pre 1.9 versions - # Note: TensorFlow 1.7 supported custom ops and all header files. - # TensorFlow 1.8 broke that promise and 1.9, 1.10 are fine again. - # This cmake-file is only tested against 1.9+. - if("${TF_DETECTED_VERSION}" VERSION_LESS "1.9") - message(FATAL_ERROR "Your installed TensorFlow version ${TF_DETECTED_VERSION} is too old.") - endif() - - if(TF_FIND_VERSION_EXACT) - # User requested exact match of TensorFlow. 
- # TensorFlow release cycles are currently just depending on (major, minor) - # But we test against both. - set(_TensorFlow_TEST_VERSIONS - "${TF_FIND_VERSION_MAJOR}.${TF_FIND_VERSION_MINOR}.${TF_FIND_VERSION_PATCH}" - "${TF_FIND_VERSION_MAJOR}.${TF_FIND_VERSION_MINOR}") - else() # TF_FIND_VERSION_EXACT - # User requested not an exact TensorFlow version. - # However, only TensorFlow versions 1.9, 1.10 support all header files - # for custom ops. - set(_TensorFlow_KNOWN_VERSIONS ${TensorFlow_ADDITIONAL_VERSIONS} - "1.9" "1.9.0" "1.10" "1.10.0" "1.11" "1.11.0" "1.12" "1.12.0" "1.13" "1.13.1" "1.14" "2.4" "2.5" "2.6") - set(_TensorFlow_TEST_VERSIONS) - - if(TF_FIND_VERSION) - set(_TF_FIND_VERSION_SHORT "${TF_FIND_VERSION_MAJOR}.${TF_FIND_VERSION_MINOR}") - # Select acceptable versions. - foreach(version ${_TensorFlow_KNOWN_VERSIONS}) - if(NOT "${version}" VERSION_LESS "${TF_FIND_VERSION}") - # This version is high enough. - list(APPEND _TensorFlow_TEST_VERSIONS "${version}") - endif() - endforeach() - else() # TF_FIND_VERSION - # Any version is acceptable. - set(_TensorFlow_TEST_VERSIONS "${_TensorFlow_KNOWN_VERSIONS}") - endif() - endif() - - #### ---- Configure TensorFlow_SOURCE_DIR - # Order of precidence is 1) CMake variable value, 2) Environmental Variable value - if(IS_DIRECTORY "${TensorFlow_SOURCE_DIR}") - set(TensorFlow_SOURCE_DIR "${TensorFlow_SOURCE_DIR}" CACHE PATH "directory containing the file 'libtensorflow_cc${CMAKE_SHARED_LIBRARY_SUFFIX}'") - else() - if(IS_DIRECTORY "$ENV{TENSORFLOW_SOURCE_DIR}") - set(TensorFlow_SOURCE_DIR "$ENV{TENSORFLOW_SOURCE_DIR}" CACHE PATH "source code for tensorflow (i.e. the git checkout directory of the source code)") - else() - set(TensorFlow_SOURCE_DIR "TensorFlow_SOURCE_DIR-NOTFOUND" CACHE PATH "source code for tensorflow (i.e. 
the git checkout directory of the source code)") - endif() - endif() - - # Report on status of cmake cache variable for TensorFlow_SOURCE_DIR - if(IS_DIRECTORY ${TensorFlow_SOURCE_DIR}) - message(STATUS "TensorFlow_SOURCE_DIR is ${TensorFlow_SOURCE_DIR}") - else() - # NOTE This is not a fatal error for backward compatibility ("custom_op test") - message(STATUS "No directory at 'TensorFlow_SOURCE_DIR:PATH=${TensorFlow_SOURCE_DIR}' detected,\n" - "please specify the path in ENV 'export TENSORFLOW_SOURCE_DIR=...'\n or cmake -DTensorFlow_SOURCE_DIR:PATH=...\n" - "to the directory containing the source code for tensorflow\n (i.e. the git checkout directory of the source code)" - ) - endif() - - #### ---- Configure TensorFlow_BUILD_DIR - # Order of precidence is 1) CMake variable value, 2) Environmental Variable value - if(IS_DIRECTORY "${TensorFlow_BUILD_DIR}") - set(TensorFlow_BUILD_DIR "${TensorFlow_BUILD_DIR}" CACHE PATH "directory containing the file 'libtensorflow_cc${CMAKE_SHARED_LIBRARY_SUFFIX}'") - else() - if(IS_DIRECTORY "$ENV{TENSORFLOW_BUILD_DIR}") - set(TensorFlow_BUILD_DIR "$ENV{TENSORFLOW_BUILD_DIR}" CACHE PATH "directory containing the file 'libtensorflow_cc${CMAKE_SHARED_LIBRARY_SUFFIX}'") - else() - set(TensorFlow_BUILD_DIR "TensorFlow_BUILD_DIR-NOTFOUND" CACHE PATH "directory containing the file 'libtensorflow_cc${CMAKE_SHARED_LIBRARY_SUFFIX}'") - endif() - endif() - - # Report on status of cmake cache variable for TensorFlow_BUILD_DIR - if(IS_DIRECTORY ${TensorFlow_BUILD_DIR}) - message(STATUS "TensorFlow_BUILD_DIR is ${TensorFlow_BUILD_DIR}") - else() - # NOTE This is not a fatal error for backward compatibility ("custom_op test") - message(STATUS "No directory at 'TensorFlow_BUILD_DIR:PATH=${TensorFlow_BUILD_DIR}' detected,\n" - "please specify the path in ENV 'export TENSORFLOW_BUILD_DIR=...'\n or cmake -DTensorFlow_BUILD_DIR:PATH=...\n" - "to the directory containing the file 'libtensorflow_cc${CMAKE_SHARED_LIBRARY_SUFFIX}'" - ) - endif() - - 
if(IS_DIRECTORY ${TensorFlow_BUILD_DIR}) - file(GLOB_RECURSE TF_LIBRARY_SEARCH_PATHS - LIST_DIRECTORIES FALSE - "${TensorFlow_BUILD_DIR}/*libtensorflow_cc${CMAKE_SHARED_LIBRARY_SUFFIX}" - ) - list(LENGTH TF_LIBRARY_SEARCH_PATHS TF_LIBRARY_SEARCH_PATHS_LENGTH) - if( NOT ${TF_LIBRARY_SEARCH_PATHS_LENGTH} EQUAL 1 ) - message(FATAL_ERROR "Incorrect number of items matching 'libtensorflow_cc${CMAKE_SHARED_LIBRARY_SUFFIX}' in '${TF_LIBRARY_SEARCH_PATHS}'\n" - "( ${TF_LIBRARY_SEARCH_PATHS_LENGTH} != 1 ).\n" - "Change 'TensorFlow_BUILD_DIR' to have more specific path." - ) - endif() - list(GET TF_LIBRARY_SEARCH_PATHS 0 TF_LIBRARY_SEARCH_ONEPATH) - get_filename_component(TensorFlow_C_LIBRARY_DIR "${TF_LIBRARY_SEARCH_ONEPATH}" DIRECTORY ) - - if( IS_DIRECTORY "${TensorFlow_C_LIBRARY_DIR}") - find_library(TensorFlow_C_LIBRARY - NAMES tensorflow_cc - PATHS "${TensorFlow_C_LIBRARY_DIR}" - DOC "TensorFlow CC library." ) - endif() - if( TensorFlow_C_LIBRARY ) - message(STATUS "TensorFlow-CC-LIBRARY is ${TensorFlow_C_LIBRARY}") - else() - # NOTE This is not a fatal error for backward compatibility ("custom_op test") - message(STATUS "No TensorFlow-CC-LIBRARY detected") - endif() - endif() - - find_library( TF_DETECTED_LIBRARY - NAMES tensorflow - PATHS "${TensorFlow_C_LIBRARY_DIR}" # Prefer the library from the build tree, if TensorFlow_C_LIBRARY is detected. - "${TF_DETECTED_LIBRARY_PATH}" # use copy of file from the python install tree (This often has a .so.1 extension only for installed version) - DOC "The tensorflow_framework library path." 
- ) - if( TF_DETECTED_LIBRARY ) - message(STATUS "Found: ${TF_DETECTED_LIBRARY}") - else() - message(FATAL_ERROR "Required library for tensorflow_framework not found in ${TF_DETECTED_LIBRARY_PATH}!") - endif() - - # test all given versions - set(TensorFlow_FOUND FALSE) - foreach(_TensorFlow_VER ${_TensorFlow_TEST_VERSIONS}) - if("${TF_DETECTED_VERSION_MAJOR}.${TF_DETECTED_VERSION_MINOR}" STREQUAL "${_TensorFlow_VER}") - # found appropriate version - set(TensorFlow_VERSION ${TF_DETECTED_VERSION}) - set(TensorFlow_ABI ${TF_DETECTED_ABI}) - set(TensorFlow_INCLUDE_DIR ${TF_DETECTED_INCLUDE_DIR}) - set(TensorFlow_LIBRARY ${TF_DETECTED_LIBRARY}) - set(TensorFlow_FOUND TRUE) - message(STATUS "Found TensorFlow: (found appropriate version \"${TensorFlow_VERSION}\")") - message(STATUS "TensorFlow-ABI is ${TensorFlow_ABI}") - message(STATUS "TensorFlow-INCLUDE_DIR is ${TensorFlow_INCLUDE_DIR}") - message(STATUS "TensorFlow-LIBRARY is ${TensorFlow_LIBRARY}") - - add_definitions("-DTENSORFLOW_ABI=${TensorFlow_ABI}") - add_definitions("-DTENSORFLOW_VERSION=${TensorFlow_VERSION}") - break() - endif() - endforeach() - - if(NOT TensorFlow_FOUND) - message(FATAL_ERROR "Your installed TensorFlow version ${TF_DETECTED_VERSION_MAJOR}.${TF_DETECTED_VERSION_MINOR} is not supported\n" - "We tested against ${_TensorFlow_TEST_VERSIONS}") - endif() - - # test 1.11 version - if("${TF_DETECTED_VERSION}" VERSION_EQUAL "1.11") - set(TF_DISABLE_ASSERTS "TRUE") - endif() - - if("${TF_DETECTED_VERSION}" VERSION_EQUAL "1.12") - set(TF_DISABLE_ASSERTS "TRUE") - endif() - - if("${TF_DETECTED_VERSION}" VERSION_EQUAL "1.12.0") - set(TF_DISABLE_ASSERTS "TRUE") - endif() - - if("${TF_DETECTED_VERSION}" VERSION_EQUAL "1.13") - set(TF_DISABLE_ASSERTS "TRUE") - endif() - - if("${TF_DETECTED_VERSION}" VERSION_EQUAL "1.13.1") - set(TF_DISABLE_ASSERTS "TRUE") - endif() - -endif() #-- End detection - -if(${TF_DISABLE_ASSERTS}) - message(STATUS "[WARNING] The TensorFlow version ${TF_DETECTED_VERSION} has a bug 
(see \#22766). We disable asserts using -DNDEBUG=True ") - add_definitions("-DNDEBUG=True") -endif() -macro(TensorFlow_REQUIRE_C_LIBRARY) - if(NOT EXISTS "${TensorFlow_C_LIBRARY}") - message(FATAL_ERROR "Project requires libtensorflow_cc${CMAKE_SHARED_LIBRARY_SUFFIX}, please specify the path in ENV 'export TENSORFLOW_BUILD_DIR=...' or cmake -DTensorFlow_BUILD_DIR:PATH=...") - endif() -endmacro() - -macro(TensorFlow_REQUIRE_SOURCE) - if(NOT IS_DIRECTORY "${TensorFlow_SOURCE_DIR}") - message(FATAL_ERROR "Project requires TensorFlow source directory, please specify the path in ENV 'export TENSORFLOW_SOURCE_DIR=...' or cmake -DTensorFlow_SOURCE_DIR:PATH=...") - endif() -endmacro() - -macro(add_tensorflow_cpu_operation op_name) - # Compiles a CPU-only operation without invoking NVCC - message(STATUS "will build custom TensorFlow operation \"${op_name}\" (CPU only)") - - add_library(${op_name}_op SHARED kernels/${op_name}_op.cc kernels/${op_name}_kernel.cc ops/${op_name}.cc ) - - set_target_properties(${op_name}_op PROPERTIES PREFIX "") - target_link_libraries(${op_name}_op LINK_PUBLIC ${TensorFlow_LIBRARY}) -endmacro() - - -macro(add_tensorflow_gpu_operation op_name) -# Compiles a CPU + GPU operation with invoking NVCC - message(STATUS "will build custom TensorFlow operation \"${op_name}\" (CPU+GPU)") - - set(kernel_file "") - if(EXISTS "kernels/${op_name}_kernel.cu") - message(WARNING "you should rename your file ${op_name}_kernel.cu to ${op_name}_kernel_gpu.cu.cc") - set(kernel_file kernels/${op_name}_kernel.cu) - else() - set_source_files_properties(kernels/${op_name}_kernel_gpu.cu.cc PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ) - set(kernel_file kernels/${op_name}_kernel_gpu.cu.cc) - endif() - - cuda_add_library(${op_name}_op_cu SHARED ${kernel_file}) - set_target_properties(${op_name}_op_cu PROPERTIES PREFIX "") - - add_library(${op_name}_op SHARED kernels/${op_name}_op.cc kernels/${op_name}_kernel.cc ops/${op_name}.cc ) - - set_target_properties(${op_name}_op 
PROPERTIES PREFIX "") - set_target_properties(${op_name}_op PROPERTIES COMPILE_FLAGS "-DGOOGLE_CUDA") - target_link_libraries(${op_name}_op LINK_PUBLIC ${op_name}_op_cu ${TensorFlow_LIBRARY}) -endmacro() - -# simplify TensorFlow dependencies -add_library(TensorFlow_DEP INTERFACE) -target_include_directories(TensorFlow_DEP SYSTEM INTERFACE ${TensorFlow_SOURCE_DIR}) -target_include_directories(TensorFlow_DEP SYSTEM INTERFACE ${TensorFlow_INCLUDE_DIR}) -target_link_libraries(TensorFlow_DEP INTERFACE -Wl,--allow-multiple-definition -Wl,--whole-archive ${TensorFlow_C_LIBRARY} -Wl,--no-whole-archive) -target_link_libraries(TensorFlow_DEP INTERFACE -Wl,--allow-multiple-definition -Wl,--whole-archive ${TensorFlow_LIBRARY} -Wl,--no-whole-archive) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args( - TENSORFLOW - FOUND_VAR TENSORFLOW_FOUND - REQUIRED_VARS - TensorFlow_LIBRARY - TensorFlow_INCLUDE_DIR - VERSION_VAR - TensorFlow_VERSION - ) - -mark_as_advanced(TF_INFORMATION_STRING TF_DETECTED_VERSION TF_DETECTED_VERSION_MAJOR TF_DETECTED_VERSION_MINOR TF_DETECTED_VERSION TF_DETECTED_ABI - TF_DETECTED_INCLUDE_DIR TF_DETECTED_LIBRARY TF_DISABLE_ASSERTS - TensorFlow_C_LIBRARY TensorFlow_LIBRARY TensorFlow_SOURCE_DIR TensorFlow_INCLUDE_DIR TensorFlow_ABI) - -set(TensorFlow_INCLUDE_DIR "${TensorFlow_INCLUDE_DIR}" CACHE PATH "The path to tensorflow header files") -set(TensorFlow_VERSION "${TensorFlow_VERSION}" CACHE INTERNAL "The Tensorflow version") -set(TensorFlow_ABI "${TensorFlow_ABI}" CACHE STRING "The ABI version used by TensorFlow") -set(TensorFlow_LIBRARY "${TensorFlow_LIBRARY}" CACHE FILEPATH "The C++ library of TensorFlow") -set(TensorFlow_C_LIBRARY "${TensorFlow_C_LIBRARY}" CACHE STRING "The C library of TensorFlow") -set(TensorFlow_FOUND "${TensorFlow_FOUND}" CACHE BOOL "A flag stating if TensorFlow has been found") -set(TF_DISABLE_ASSERTS "${TF_DISABLE_ASSERTS}" CACHE BOOL "A flag to enable workarounds") diff --git 
a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 3b0905021..af5ebe508 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -24,13 +24,14 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import json import os -import psutil import typing as t - from functools import lru_cache from pathlib import Path +import psutil + from ...error import SSConfigError from ..utils.helpers import expand_exe_path @@ -143,6 +144,14 @@ def database_cli(self) -> str: "Specified Redis binary at REDIS_CLI_PATH could not be used" ) from e + @property + def database_file_parse_trials(self) -> int: + return int(os.getenv("SMARTSIM_DB_FILE_PARSE_TRIALS", "10")) + + @property + def database_file_parse_interval(self) -> int: + return int(os.getenv("SMARTSIM_DB_FILE_PARSE_INTERVAL", "2")) + @property def log_level(self) -> str: return os.environ.get("SMARTSIM_LOG_LEVEL", "info") @@ -171,6 +180,20 @@ def test_num_gpus(self) -> int: # pragma: no cover def test_port(self) -> int: # pragma: no cover return int(os.environ.get("SMARTSIM_TEST_PORT", 6780)) + @property + def test_batch_resources(self) -> t.Dict[t.Any, t.Any]: # pragma: no cover + resource_str = os.environ.get("SMARTSIM_TEST_BATCH_RESOURCES", "{}") + resources = json.loads(resource_str) + if not isinstance(resources, dict): + raise TypeError( + ( + "SMARTSIM_TEST_BATCH_RESOURCES was not interpreted as a " + "dictionary, check to make sure that it is a valid " + f"JSON string: {resource_str}" + ) + ) + return resources + @property def test_interface(self) -> t.List[str]: # pragma: no cover if interfaces_cfg := os.environ.get("SMARTSIM_TEST_INTERFACE", None): @@ -196,6 +219,18 @@ def test_account(self) -> t.Optional[str]: # pragma: no cover # no account by default return os.environ.get("SMARTSIM_TEST_ACCOUNT", None) + @property + def telemetry_frequency(self) -> int: + return 
int(os.environ.get("SMARTSIM_TELEMETRY_FREQUENCY", 5)) + + @property + def telemetry_enabled(self) -> bool: + return int(os.environ.get("SMARTSIM_FLAG_TELEMETRY", "0")) > 0 + + @property + def telemetry_cooldown(self) -> int: + return int(os.environ.get("SMARTSIM_TELEMETRY_COOLDOWN", 90)) + @lru_cache(maxsize=128, typed=False) def get_config() -> Config: diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index b3aae3fbc..e3e463c51 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -26,36 +26,58 @@ from __future__ import annotations +import itertools import os.path as osp +import pathlib import pickle import signal +import subprocess +import sys import threading import time import typing as t +from os import environ -from smartredis import Client +from smartredis import Client, ConfigOptions + +from smartsim._core.utils.network import get_ip_from_host from ..._core.launcher.step import Step -from ..._core.utils.redis import db_is_active, set_ml_model, set_script +from ..._core.utils.helpers import unpack_colo_db_identifier, unpack_db_identifier +from ..._core.utils.redis import ( + db_is_active, + set_ml_model, + set_script, + shutdown_db_node, +) from ...database import Orchestrator -from ...entity import EntityList, SmartSimEntity, Model, Ensemble -from ...error import LauncherError, SmartSimError, SSInternalError, SSUnsupportedError +from ...entity import Ensemble, EntityList, EntitySequence, Model, SmartSimEntity +from ...error import ( + LauncherError, + SmartSimError, + SSDBIDConflictError, + SSInternalError, + SSUnsupportedError, +) from ...log import get_logger -from ...status import STATUS_RUNNING, TERMINAL_STATUSES +from ...servertype import CLUSTERED, STANDALONE +from ...status import STATUS_CANCELLED, STATUS_RUNNING, TERMINAL_STATUSES from ..config import CONFIG from ..launcher import ( - SlurmLauncher, - PBSLauncher, - LocalLauncher, CobaltLauncher, + LocalLauncher, 
LSFLauncher, + PBSLauncher, + SlurmLauncher, ) from ..launcher.launcher import Launcher -from ..utils import check_cluster_status, create_cluster -from .jobmanager import JobManager -from .manifest import Manifest +from ..utils import check_cluster_status, create_cluster, serialize from .job import Job -from ...settings.base import BatchSettings +from .jobmanager import JobManager +from .manifest import LaunchedManifest, LaunchedManifestBuilder, Manifest + +if t.TYPE_CHECKING: + from ..utils.serialize import TStepLaunchMetaData logger = get_logger(__name__) @@ -78,9 +100,15 @@ def __init__(self, launcher: str = "local") -> None: """ self._jobs = JobManager(JM_LOCK) self.init_launcher(launcher) + self._telemetry_monitor: t.Optional[subprocess.Popen[bytes]] = None def start( - self, manifest: Manifest, block: bool = True, kill_on_interrupt: bool = True + self, + exp_name: str, + exp_path: str, + manifest: Manifest, + block: bool = True, + kill_on_interrupt: bool = True, ) -> None: """Start the passed SmartSim entities @@ -93,12 +121,20 @@ def start( self._jobs.kill_on_interrupt = kill_on_interrupt # register custom signal handler for ^C (SIGINT) signal.signal(signal.SIGINT, self._jobs.signal_interrupt) - self._launch(manifest) + launched = self._launch(exp_name, exp_path, manifest) # start the job manager thread if not already started if not self._jobs.actively_monitoring: self._jobs.start() + serialize.save_launch_manifest( + launched.map(_look_up_launched_data(self._launcher)) + ) + + # launch a telemetry monitor to track job progress + if CONFIG.telemetry_enabled: + self._start_telemetry_monitor(exp_path) + # block until all non-database jobs are complete if block: # poll handles its own keyboard interrupt as @@ -136,23 +172,25 @@ def poll( for job in to_monitor.values(): logger.info(job) - def finished(self, entity: t.Union[SmartSimEntity, EntityList]) -> bool: + def finished( + self, entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]] + ) -> bool: 
"""Return a boolean indicating wether a job has finished or not :param entity: object launched by SmartSim. - :type entity: Entity | EntityList + :type entity: Entity | EntitySequence :returns: bool :raises ValueError: if entity has not been launched yet """ try: if isinstance(entity, Orchestrator): raise TypeError("Finished() does not support Orchestrator instances") - if isinstance(entity, EntityList): + if isinstance(entity, EntitySequence): return all(self.finished(ent) for ent in entity.entities) if not isinstance(entity, SmartSimEntity): raise TypeError( f"Argument was of type {type(entity)} not derived " - "from SmartSimEntity or EntityList" + "from SmartSimEntity or EntitySequence" ) return self._jobs.is_finished(entity) @@ -161,14 +199,16 @@ def finished(self, entity: t.Union[SmartSimEntity, EntityList]) -> bool: f"Entity {entity.name} has not been launched in this experiment" ) from None - def stop_entity(self, entity: t.Union[SmartSimEntity, EntityList]) -> None: + def stop_entity( + self, entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]] + ) -> None: """Stop an instance of an entity This function will also update the status of the job in the jobmanager so that the job appears as "cancelled". 
:param entity: entity to be stopped - :type entity: Entity | EntityList + :type entity: Entity | EntitySequence """ with JM_LOCK: job = self._jobs[entity.name] @@ -189,12 +229,38 @@ def stop_entity(self, entity: t.Union[SmartSimEntity, EntityList]) -> None: ) self._jobs.move_to_completed(job) - def stop_entity_list(self, entity_list: EntityList) -> None: + def stop_db(self, db: Orchestrator) -> None: + """Stop an orchestrator + :param db: orchestrator to be stopped + :type db: Orchestrator + """ + if db.batch: + self.stop_entity(db) + else: + with JM_LOCK: + for node in db.entities: + for host_ip, port in itertools.product( + (get_ip_from_host(host) for host in node.hosts), db.ports + ): + retcode, _, _ = shutdown_db_node(host_ip, port) + # Sometimes the DB will not shutdown (unless we force NOSAVE) + if retcode != 0: + self.stop_entity(node) + continue + + job = self._jobs[node.name] + job.set_status(STATUS_CANCELLED, "", 0, output=None, error=None) + self._jobs.move_to_completed(job) + + db.reset_hosts() + + def stop_entity_list(self, entity_list: EntitySequence[SmartSimEntity]) -> None: """Stop an instance of an entity list :param entity_list: entity list to be stopped - :type entity_list: EntityList + :type entity_list: EntitySequence """ + if entity_list.batch: self.stop_entity(entity_list) else: @@ -209,34 +275,40 @@ def get_jobs(self) -> t.Dict[str, Job]: with JM_LOCK: return self._jobs.completed - def get_entity_status(self, entity: t.Union[SmartSimEntity, EntityList]) -> str: + def get_entity_status( + self, entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]] + ) -> str: """Get the status of an entity :param entity: entity to get status of - :type entity: SmartSimEntity | EntityList - :raises TypeError: if not SmartSimEntity | EntityList + :type entity: SmartSimEntity | EntitySequence + :raises TypeError: if not SmartSimEntity | EntitySequence :return: status of entity :rtype: str """ - if not isinstance(entity, (SmartSimEntity, EntityList)): + 
if not isinstance(entity, (SmartSimEntity, EntitySequence)): raise TypeError( - "Argument must be of type SmartSimEntity or EntityList, " + "Argument must be of type SmartSimEntity or EntitySequence, " f"not {type(entity)}" ) return self._jobs.get_status(entity) - def get_entity_list_status(self, entity_list: EntityList) -> t.List[str]: + def get_entity_list_status( + self, entity_list: EntitySequence[SmartSimEntity] + ) -> t.List[str]: """Get the statuses of an entity list :param entity_list: entity list containing entities to get statuses of - :type entity_list: EntityList - :raises TypeError: if not EntityList + :type entity_list: EntitySequence + :raises TypeError: if not EntitySequence :return: list of str statuses :rtype: list """ - if not isinstance(entity_list, EntityList): - raise TypeError(f"Argument was of type {type(entity_list)} not EntityList") + if not isinstance(entity_list, EntitySequence): + raise TypeError( + f"Argument was of type {type(entity_list)} not EntitySequence" + ) if entity_list.batch: return [self.get_entity_status(entity_list)] statuses = [] @@ -275,63 +347,96 @@ def init_launcher(self, launcher: str) -> None: else: raise TypeError("Must provide a 'launcher' argument") - def _launch(self, manifest: Manifest) -> None: + def _launch( + self, exp_name: str, exp_path: str, manifest: Manifest + ) -> LaunchedManifest[t.Tuple[str, Step]]: """Main launching function of the controller Orchestrators are always launched first so that the address of the database can be given to following entities + :param exp_name: The name of the launching experiment + :type exp_name: str + :param exp_path: path to location of ``Experiment`` directory if generated + :type exp_path: str :param manifest: Manifest of deployables to launch :type manifest: Manifest """ - orchestrator = manifest.db - if orchestrator: + + manifest_builder = LaunchedManifestBuilder[t.Tuple[str, Step]]( + exp_name=exp_name, exp_path=exp_path, launcher_name=str(self._launcher) + ) + # 
Loop over deployables to launch and launch multiple orchestrators + for orchestrator in manifest.dbs: + for key in self._jobs.get_db_host_addresses(): + _, db_id = unpack_db_identifier(key, "_") + if orchestrator.db_identifier == db_id: + raise SSDBIDConflictError( + f"Database identifier {orchestrator.db_identifier}" + " has already been used. Pass in a unique" + " name for db_identifier" + ) + if orchestrator.num_shards > 1 and isinstance( self._launcher, LocalLauncher ): raise SmartSimError( "Local launcher does not support multi-host orchestrators" ) - if self.orchestrator_active: - msg = "Attempted to launch a second Orchestrator instance. " - msg += "Only 1 Orchestrator can be active at a time" - raise SmartSimError(msg) - self._launch_orchestrator(orchestrator) + self._launch_orchestrator(orchestrator, manifest_builder) if self.orchestrator_active: self._set_dbobjects(manifest) # create all steps prior to launch - steps: t.List[t.Tuple[Step, t.Union[SmartSimEntity, EntityList]]] = [] - all_entity_lists = manifest.ensembles - for elist in all_entity_lists: + steps: t.List[ + t.Tuple[Step, t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]]] + ] = [] + for elist in manifest.ensembles: + ens_telem_dir = manifest_builder.run_telemetry_subdirectory / "ensemble" if elist.batch: - batch_step = self._create_batch_job_step(elist) + batch_step, substeps = self._create_batch_job_step(elist, ens_telem_dir) + manifest_builder.add_ensemble( + elist, [(batch_step.name, step) for step in substeps] + ) steps.append((batch_step, elist)) else: # if ensemble is to be run as separate job steps, aka not in a batch - job_steps = [(self._create_job_step(e), e) for e in elist.entities] + job_steps = [ + (self._create_job_step(e, ens_telem_dir / elist.name), e) + for e in elist.entities + ] + manifest_builder.add_ensemble( + elist, [(step.name, step) for step, _ in job_steps] + ) steps.extend(job_steps) - # models themselves cannot be batch steps. 
If batch settings are # attached, wrap them in an anonymous batch job step for model in manifest.models: + model_telem_dir = manifest_builder.run_telemetry_subdirectory / "model" if model.batch_settings: - anon_entity_list = _AnonymousBatchJob( - model.name, model.path, model.batch_settings + anon_entity_list = _AnonymousBatchJob(model) + batch_step, _ = self._create_batch_job_step( + anon_entity_list, model_telem_dir ) - anon_entity_list.entities.append(model) - batch_step = self._create_batch_job_step(anon_entity_list) + manifest_builder.add_model(model, (batch_step.name, batch_step)) steps.append((batch_step, model)) else: - job_step = self._create_job_step(model) + job_step = self._create_job_step(model, model_telem_dir) + manifest_builder.add_model(model, (job_step.name, job_step)) steps.append((job_step, model)) # launch steps for step, entity in steps: self._launch_step(step, entity) - def _launch_orchestrator(self, orchestrator: Orchestrator) -> None: + return manifest_builder.finalize() + + def _launch_orchestrator( + self, + orchestrator: Orchestrator, + manifest_builder: LaunchedManifestBuilder[t.Tuple[str, Step]], + ) -> None: """Launch an Orchestrator instance This function will launch the Orchestrator instance and @@ -340,17 +445,32 @@ def _launch_orchestrator(self, orchestrator: Orchestrator) -> None: :param orchestrator: orchestrator to launch :type orchestrator: Orchestrator + :param manifest_builder: An `LaunchedManifestBuilder` to record the + names and `Step`s of the launched orchestrator + :type manifest_builder: LaunchedManifestBuilder[tuple[str, Step]] """ orchestrator.remove_stale_files() + orc_telem_dir = manifest_builder.run_telemetry_subdirectory / "database" # if the orchestrator was launched as a batch workload if orchestrator.batch: - orc_batch_step = self._create_batch_job_step(orchestrator) + orc_batch_step, substeps = self._create_batch_job_step( + orchestrator, orc_telem_dir + ) + manifest_builder.add_database( + orchestrator, 
[(orc_batch_step.name, step) for step in substeps] + ) self._launch_step(orc_batch_step, orchestrator) # if orchestrator was run on existing allocation, locally, or in allocation else: - db_steps = [(self._create_job_step(db), db) for db in orchestrator.dbnodes] + db_steps = [ + (self._create_job_step(db, orc_telem_dir / orchestrator.name), db) + for db in orchestrator.entities + ] + manifest_builder.add_database( + orchestrator, [(step.name, step) for step, _ in db_steps] + ) for db_step in db_steps: self._launch_step(*db_step) @@ -386,7 +506,9 @@ def _launch_orchestrator(self, orchestrator: Orchestrator) -> None: logger.debug(f"Orchestrator launched on nodes: {orchestrator.hosts}") def _launch_step( - self, job_step: Step, entity: t.Union[SmartSimEntity, EntityList] + self, + job_step: Step, + entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]], ) -> None: """Use the launcher to launch a job step @@ -418,35 +540,52 @@ def _launch_step( self._jobs.add_job(job_step.name, job_id, entity, is_task) def _create_batch_job_step( - self, entity_list: t.Union[Orchestrator, Ensemble, _AnonymousBatchJob] - ) -> Step: + self, + entity_list: t.Union[Orchestrator, Ensemble, _AnonymousBatchJob], + telemetry_dir: pathlib.Path, + ) -> t.Tuple[Step, t.List[Step]]: """Use launcher to create batch job step :param entity_list: EntityList to launch as batch :type entity_list: EntityList - :return: job step instance - :rtype: Step + :param telemetry_dir: Path to a directory in which the batch job step + may write telemetry events + :type telemetry_dir: pathlib.Path + :return: batch job step instance and a list of run steps to be + executed within the batch job + :rtype: tuple[Step, list[Step]] """ if not entity_list.batch_settings: raise ValueError( "EntityList must have batch settings to be launched as batch" ) + telemetry_dir = telemetry_dir / entity_list.name batch_step = self._launcher.create_step( entity_list.name, entity_list.path, entity_list.batch_settings ) + 
batch_step.meta["entity_type"] = str(type(entity_list).__name__).lower() + batch_step.meta["status_dir"] = str(telemetry_dir / entity_list.name) + + substeps = [] for entity in entity_list.entities: # tells step creation not to look for an allocation entity.run_settings.in_batch = True - step = self._create_job_step(entity) + step = self._create_job_step(entity, telemetry_dir) + substeps.append(step) batch_step.add_to_batch(step) - return batch_step + return batch_step, substeps - def _create_job_step(self, entity: SmartSimEntity) -> Step: + def _create_job_step( + self, entity: SmartSimEntity, telemetry_dir: pathlib.Path + ) -> Step: """Create job steps for all entities with the launcher :param entity: an entity to create a step for :type entity: SmartSimEntity + :param telemetry_dir: Path to a directory in which the job step + may write telemetry events + :type telemetry_dir: pathlib.Path :return: the job step :rtype: Step """ @@ -455,6 +594,10 @@ def _create_job_step(self, entity: SmartSimEntity) -> Step: self._prep_entity_client_env(entity) step = self._launcher.create_step(entity.name, entity.path, entity.run_settings) + + step.meta["entity_type"] = str(type(entity).__name__).lower() + step.meta["status_dir"] = str(telemetry_dir / entity.name) + return step def _prep_entity_client_env(self, entity: Model) -> None: @@ -463,23 +606,42 @@ def _prep_entity_client_env(self, entity: Model) -> None: :param entity: The entity to retrieve connections from :type entity: Model """ + client_env: t.Dict[str, t.Union[str, int, float, bool]] = {} - addresses = self._jobs.get_db_host_addresses() - if addresses: - if len(addresses) <= 128: - client_env["SSDB"] = ",".join(addresses) - else: + address_dict = self._jobs.get_db_host_addresses() + + for db_id, addresses in address_dict.items(): + db_name, _ = unpack_db_identifier(db_id, "_") + if addresses: # Cap max length of SSDB - client_env["SSDB"] = ",".join(addresses[:128]) - if entity.incoming_entities: - 
client_env["SSKEYIN"] = ",".join( - [in_entity.name for in_entity in entity.incoming_entities] + client_env[f"SSDB{db_name}"] = ",".join(addresses[:128]) + + # Retrieve num_shards to append to client env + client_env[f"SR_DB_TYPE{db_name}"] = ( + CLUSTERED if len(addresses) > 1 else STANDALONE ) - if entity.query_key_prefixing(): - client_env["SSKEYOUT"] = entity.name + + if entity.incoming_entities: + client_env["SSKEYIN"] = ",".join( + [in_entity.name for in_entity in entity.incoming_entities] + ) + if entity.query_key_prefixing(): + client_env["SSKEYOUT"] = entity.name # Set address to local if it's a colocated model - if entity.colocated: + if entity.colocated and entity.run_settings.colocated_db_settings is not None: + db_name_colo = entity.run_settings.colocated_db_settings["db_identifier"] + + for key in address_dict: + _, db_id = unpack_db_identifier(key, "_") + if db_name_colo == db_id: + raise SSDBIDConflictError( + f"Database identifier {db_name_colo}" + " has already been used. 
Pass in a unique" + " name for db_identifier" + ) + + db_name_colo = unpack_colo_db_identifier(db_name_colo) if colo_cfg := entity.run_settings.colocated_db_settings: port = colo_cfg.get("port", None) socket = colo_cfg.get("unix_socket", None) @@ -488,13 +650,15 @@ def _prep_entity_client_env(self, entity: Model) -> None: "Co-located was configured for both TCP/IP and UDS" ) if port: - client_env["SSDB"] = f"127.0.0.1:{str(port)}" + client_env[f"SSDB{db_name_colo}"] = f"127.0.0.1:{str(port)}" elif socket: - client_env["SSDB"] = f"unix://{socket}" + client_env[f"SSDB{db_name_colo}"] = f"unix://{socket}" else: raise SSInternalError( "Colocated database was not configured for either TCP or UDS" ) + client_env[f"SR_DB_TYPE{db_name_colo}"] = STANDALONE + entity.run_settings.update_env(client_env) def _save_orchestrator(self, orchestrator: Orchestrator) -> None: @@ -550,7 +714,7 @@ def _orchestrator_launch_wait(self, orchestrator: Orchestrator) -> None: # TODO remove in favor of by node status check time.sleep(CONFIG.jm_interval) elif any(stat in TERMINAL_STATUSES for stat in statuses): - self.stop_entity_list(orchestrator) + self.stop_db(orchestrator) msg = "Orchestrator failed during startup" msg += f" See {orchestrator.path} for details" raise SmartSimError(msg) @@ -558,7 +722,7 @@ def _orchestrator_launch_wait(self, orchestrator: Orchestrator) -> None: logger.debug("Waiting for orchestrator instances to spin up...") except KeyboardInterrupt: logger.info("Orchestrator launch cancelled - requesting to stop") - self.stop_entity_list(orchestrator) + self.stop_db(orchestrator) # re-raise keyboard interrupt so the job manager will display # any running and un-killed jobs as this method is only called @@ -625,47 +789,119 @@ def _set_dbobjects(self, manifest: Manifest) -> None: if not manifest.has_db_objects: return - db_addresses = self._jobs.get_db_host_addresses() + address_dict = self._jobs.get_db_host_addresses() + for ( + db_id, + db_addresses, + ) in 
address_dict.items(): + db_name, name = unpack_db_identifier(db_id, "_") - hosts = list({address.split(":")[0] for address in db_addresses}) - ports = list({int(address.split(":")[-1]) for address in db_addresses}) + hosts = list({address.split(":")[0] for address in db_addresses}) + ports = list({int(address.split(":")[-1]) for address in db_addresses}) - if not db_is_active(hosts=hosts, ports=ports, num_shards=len(db_addresses)): - raise SSInternalError("Cannot set DB Objects, DB is not running") + if not db_is_active(hosts=hosts, ports=ports, num_shards=len(db_addresses)): + raise SSInternalError("Cannot set DB Objects, DB is not running") - client = Client(address=db_addresses[0], cluster=len(db_addresses) > 1) + environ[f"SSDB{db_name}"] = db_addresses[0] - for model in manifest.models: - if not model.colocated: - for db_model in model.db_models: + environ[f"SR_DB_TYPE{db_name}"] = ( + CLUSTERED if len(db_addresses) > 1 else STANDALONE + ) + + options = ConfigOptions.create_from_environment(name) + client = Client(options, logger_name="SmartSim") + + for model in manifest.models: + if not model.colocated: + for db_model in model.db_models: + set_ml_model(db_model, client) + for db_script in model.db_scripts: + set_script(db_script, client) + + for ensemble in manifest.ensembles: + for db_model in ensemble.db_models: set_ml_model(db_model, client) - for db_script in model.db_scripts: + for db_script in ensemble.db_scripts: set_script(db_script, client) + for entity in ensemble.models: + if not entity.colocated: + # Set models which could belong only + # to the entities and not to the ensemble + # but avoid duplicates + for db_model in entity.db_models: + if db_model not in ensemble.db_models: + set_ml_model(db_model, client) + for db_script in entity.db_scripts: + if db_script not in ensemble.db_scripts: + set_script(db_script, client) + + def _start_telemetry_monitor(self, exp_dir: str) -> None: + """Spawns a telemetry monitor process to keep track of the life 
times + of the processes launched through this controller. + + :param exp_dir: An experiment directory + :type exp_dir: str + """ + logger.debug("Starting telemetry monitor process") + if ( + self._telemetry_monitor is None + or self._telemetry_monitor.returncode is not None + ): + cmd = [ + sys.executable, + "-m", + "smartsim._core.entrypoints.telemetrymonitor", + "-exp_dir", + exp_dir, + "-frequency", + str(CONFIG.telemetry_frequency), + "-cooldown", + str(CONFIG.telemetry_cooldown), + ] + # pylint: disable-next=consider-using-with + self._telemetry_monitor = subprocess.Popen( + cmd, + stderr=sys.stderr, + stdout=sys.stdout, + cwd=str(pathlib.Path(__file__).parent.parent.parent), + shell=False, + ) - for ensemble in manifest.ensembles: - for db_model in ensemble.db_models: - set_ml_model(db_model, client) - for db_script in ensemble.db_scripts: - set_script(db_script, client) - for entity in ensemble.models: - if not entity.colocated: - # Set models which could belong only - # to the entities and not to the ensemble - # but avoid duplicates - for db_model in entity.db_models: - if db_model not in ensemble.db_models: - set_ml_model(db_model, client) - for db_script in entity.db_scripts: - if db_script not in ensemble.db_scripts: - set_script(db_script, client) - - -class _AnonymousBatchJob(EntityList): - def __init__( - self, name: str, path: str, batch_settings: BatchSettings, **kwargs: t.Any - ) -> None: - super().__init__(name, path) - self.batch_settings = batch_settings - def _initialize_entities(self, **kwargs: t.Any) -> None: - ... 
+class _AnonymousBatchJob(EntityList[Model]): + @staticmethod + def _validate(model: Model) -> None: + if model.batch_settings is None: + msg = "Unable to create _AnonymousBatchJob without batch_settings" + raise SmartSimError(msg) + + def __init__(self, model: Model) -> None: + self._validate(model) + super().__init__(model.name, model.path) + self.entities = [model] + self.batch_settings = model.batch_settings + + def _initialize_entities(self, **kwargs: t.Any) -> None: ... + + +def _look_up_launched_data( + launcher: Launcher, +) -> t.Callable[[t.Tuple[str, Step]], "TStepLaunchMetaData"]: + def _unpack_launched_data(data: t.Tuple[str, Step]) -> "TStepLaunchMetaData": + # NOTE: we cannot assume that the name of the launched step + # ``launched_step_name`` is equal to the name of the step referring to + # the entity ``step.name`` as is the case when an entity list is + # launched as a batch job + launched_step_name, step = data + launched_step_map = launcher.step_mapping[launched_step_name] + out_file, err_file = step.get_output_files() + return ( + launched_step_map.step_id, + launched_step_map.task_id, + launched_step_map.managed, + out_file, + err_file, + pathlib.Path(step.meta.get("status_dir", step.cwd)), + ) + + return _unpack_launched_data diff --git a/smartsim/_core/control/job.py b/smartsim/_core/control/job.py index 41b60d709..aa4ecce76 100644 --- a/smartsim/_core/control/job.py +++ b/smartsim/_core/control/job.py @@ -26,11 +26,45 @@ import time import typing as t +from dataclasses import dataclass -from ...entity import SmartSimEntity, EntityList +from ...entity import EntitySequence, SmartSimEntity from ...status import STATUS_NEW +@dataclass(frozen=True) +class _JobKey: + step_id: str + task_id: str + + +class JobEntity: + """API required for a job processed in the JobManager with support for + telemetry monitoring + """ + + def __init__(self) -> None: + self.name: str = "" + self.path: str = "" + self.step_id: str = "" + self.task_id: str = "" + 
self.type: str = "" + self.timestamp: int = 0 + self.status_dir: str = "" + + @property + def is_db(self) -> bool: + return self.type in ["orchestrator", "dbnode"] + + @property + def is_managed(self) -> bool: + return bool(self.step_id) + + @property + def key(self) -> _JobKey: + return _JobKey(self.step_id, self.task_id) + + class Job: """Keep track of various information for the controller. In doing so, continuously add various fields of information @@ -42,7 +76,7 @@ def __init__( self, job_name: str, job_id: t.Optional[str], - entity: t.Union[SmartSimEntity, EntityList], + entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity], JobEntity], launcher: str, is_task: bool, ) -> None: @@ -53,7 +87,7 @@ def __init__( :param job_id: The id associated with the job :type job_id: str :param entity: The SmartSim entity(list) associated with the job - :type entity: SmartSimEntity | EntityList + :type entity: SmartSimEntity | EntitySequence | JobEntity :param launcher: Launcher job was started with :type launcher: str :param is_task: process monitored by TaskManager (True) or the WLM (True) diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py index 21fcf1223..d23543030 100644 --- a/smartsim/_core/control/jobmanager.py +++ b/smartsim/_core/control/jobmanager.py @@ -24,21 +24,22 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ import itertools import time import typing as t -from threading import Thread, RLock +from collections import ChainMap +from threading import RLock, Thread from types import FrameType from ...database import Orchestrator -from ...entity import DBNode, SmartSimEntity, EntityList -from ...error import SmartSimError +from ...entity import DBNode, EntitySequence, SmartSimEntity from ...log import get_logger -from ...status import TERMINAL_STATUSES +from ...status import STATUS_NEVER_STARTED, TERMINAL_STATUSES from ..config import CONFIG -from ..launcher import LocalLauncher, Launcher +from ..launcher import Launcher, LocalLauncher from ..utils.network import get_ip_from_host -from .job import Job +from .job import Job, JobEntity logger = get_logger(__name__) @@ -145,13 +146,8 @@ def __getitem__(self, entity_name: str) -> Job: :rtype: Job """ with self._lock: - if entity_name in self.db_jobs: - return self.db_jobs[entity_name] - if entity_name in self.jobs: - return self.jobs[entity_name] - if entity_name in self.completed: - return self.completed[entity_name] - raise KeyError + entities = ChainMap(self.db_jobs, self.jobs, self.completed) + return entities[entity_name] def __call__(self) -> t.Dict[str, Job]: """Returns dictionary all jobs for () operator @@ -162,11 +158,18 @@ def __call__(self) -> t.Dict[str, Job]: all_jobs = {**self.jobs, **self.db_jobs} return all_jobs + def __contains__(self, key: str) -> bool: + try: + self[key] # pylint: disable=pointless-statement + return True + except KeyError: + return False + def add_job( self, job_name: str, job_id: t.Optional[str], - entity: t.Union[SmartSimEntity, EntityList], + entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity], JobEntity], is_task: bool = True, ) -> None: """Add a job to the job manager which holds specific jobs by type. 
@@ -176,7 +179,7 @@ def add_job( :param job_id: job step id created by launcher :type job_id: str :param entity: entity that was launched on job step - :type entity: SmartSimEntity | EntityList + :type entity: SmartSimEntity | EntitySequence :param is_task: process monitored by TaskManager (True) or the WLM (True) :type is_task: bool """ @@ -185,6 +188,8 @@ def add_job( job = Job(job_name, job_id, entity, launcher, is_task) if isinstance(entity, (DBNode, Orchestrator)): self.db_jobs[entity.name] = job + elif isinstance(entity, JobEntity) and entity.is_db: + self.db_jobs[entity.name] = job else: self.jobs[entity.name] = job @@ -231,25 +236,25 @@ def check_jobs(self) -> None: output=status.output, ) - def get_status(self, entity: t.Union[SmartSimEntity, EntityList]) -> str: + def get_status( + self, + entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]], + ) -> str: """Return the status of a job. - :param entity: SmartSimEntity or EntityList instance - :type entity: SmartSimEntity | EntityList + :param entity: SmartSimEntity or EntitySequence instance + :type entity: SmartSimEntity | EntitySequence :returns: tuple of status """ with self._lock: - try: - if entity.name in self.completed: - return self.completed[entity.name].status + if entity.name in self.completed: + return self.completed[entity.name].status + if entity.name in self: job: Job = self[entity.name] # locked - except KeyError: - raise SmartSimError( - f"Entity {entity.name} has not been launched in this Experiment" - ) from None + return job.status - return job.status + return STATUS_NEVER_STARTED def set_launcher(self, launcher: Launcher) -> None: """Set the launcher of the job manager to a specific launcher instance @@ -301,21 +306,28 @@ def restart_job( else: self.jobs[entity_name] = job - def get_db_host_addresses(self) -> t.List[str]: + def get_db_host_addresses(self) -> t.Dict[str, t.List[str]]: """Retrieve the list of hosts for the database + for corresponding database identifiers - 
:return: list of host ip addresses - :rtype: list[str] + :return: dictionary of host ip addresses + :rtype: Dict[str, list] """ - addresses = [] + + address_dict: t.Dict[str, t.List[str]] = {} for db_job in self.db_jobs.values(): + addresses = [] if isinstance(db_job.entity, (DBNode, Orchestrator)): db_entity = db_job.entity - for combine in itertools.product(db_job.hosts, db_entity.ports): ip_addr = get_ip_from_host(combine[0]) addresses.append(":".join((ip_addr, str(combine[1])))) - return addresses + + dict_entry: t.List[str] = address_dict.get(db_entity.db_identifier, []) + dict_entry.extend(addresses) + address_dict[db_entity.db_identifier] = dict_entry + + return address_dict def set_db_hosts(self, orchestrator: Orchestrator) -> None: """Set the DB hosts in db_jobs so future entities can query this @@ -324,11 +336,13 @@ def set_db_hosts(self, orchestrator: Orchestrator) -> None: :type orchestrator: Orchestrator """ # should only be called during launch in the controller + with self._lock: if orchestrator.batch: self.db_jobs[orchestrator.name].hosts = orchestrator.hosts + else: - for dbnode in orchestrator.dbnodes: + for dbnode in orchestrator.entities: if not dbnode.is_mpmd: self.db_jobs[dbnode.name].hosts = [dbnode.host] else: diff --git a/smartsim/_core/control/manifest.py b/smartsim/_core/control/manifest.py index 0ebac3c2e..62ab013e5 100644 --- a/smartsim/_core/control/manifest.py +++ b/smartsim/_core/control/manifest.py @@ -24,43 +24,52 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import pathlib import typing as t +from dataclasses import dataclass, field from ...database import Orchestrator -from ...entity import EntityList, SmartSimEntity, Model, Ensemble +from ...entity import DBNode, Ensemble, EntitySequence, Model, SmartSimEntity from ...error import SmartSimError -from ..utils.helpers import fmt_dict +from ..utils import helpers as _helpers +from ..utils import serialize as _serialize + +_T = t.TypeVar("_T") +_U = t.TypeVar("_U") +_AtomicLaunchableT = t.TypeVar("_AtomicLaunchableT", Model, DBNode) + +if t.TYPE_CHECKING: + import os class Manifest: - """This class is used to keep track of all deployables generated by an experiment. - Different types of deployables (i.e. different `SmartSimEntity`-derived objects or - `EntityList`-derived objects) can be accessed by using the corresponding accessor. + """This class is used to keep track of all deployables generated by an + experiment. Different types of deployables (i.e. different + `SmartSimEntity`-derived objects or `EntitySequence`-derived objects) can + be accessed by using the corresponding accessor. 
Instances of ``Model``, ``Ensemble`` and ``Orchestrator`` can all be passed as arguments """ - def __init__(self, *args: SmartSimEntity) -> None: + def __init__( + self, *args: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]] + ) -> None: self._deployables = list(args) self._check_types(self._deployables) self._check_names(self._deployables) self._check_entity_lists_nonempty() @property - def db(self) -> t.Optional[Orchestrator]: - """Return Orchestrator instances in Manifest + def dbs(self) -> t.List[Orchestrator]: + """Return a list of Orchestrator instances in Manifest :raises SmartSimError: if user added to databases to manifest - :return: orchestrator instances - :rtype: Orchestrator | None + :return: List of orchestrator instances + :rtype: list[Orchestrator] """ dbs = [item for item in self._deployables if isinstance(item, Orchestrator)] - - if len(dbs) > 1: - raise SmartSimError("User attempted to create more than one Orchestrator") - - return dbs[0] if dbs else None + return dbs @property def models(self) -> t.List[Model]: @@ -84,18 +93,16 @@ def ensembles(self) -> t.List[Ensemble]: return [e for e in self._deployables if isinstance(e, Ensemble)] @property - def all_entity_lists(self) -> t.List[EntityList]: + def all_entity_lists(self) -> t.List[EntitySequence[SmartSimEntity]]: """All entity lists, including ensembles and exceptional ones like Orchestrator :return: list of entity lists - :rtype: List[EntityList] + :rtype: List[EntitySequence[SmartSimEntity]] """ - _all_entity_lists: t.List[EntityList] = [] - _all_entity_lists.extend(self.ensembles) + _all_entity_lists: t.List[EntitySequence[SmartSimEntity]] = list(self.ensembles) - db = self.db - if db is not None: + for db in self.dbs: _all_entity_lists.append(db) return _all_entity_lists @@ -114,10 +121,10 @@ def _check_names(deployables: t.List[t.Any]) -> None: @staticmethod def _check_types(deployables: t.List[t.Any]) -> None: for deployable in deployables: - if not isinstance(deployable, 
(SmartSimEntity, EntityList)): + if not isinstance(deployable, (SmartSimEntity, EntitySequence)): raise TypeError( f"Entity has type {type(deployable)}, not " - + "SmartSimEntity or EntityList" + + "SmartSimEntity or EntitySequence" ) def _check_entity_lists_nonempty(self) -> None: @@ -152,17 +159,17 @@ def __str__(self) -> str: output += f"{model.batch_settings}\n" output += f"{model.run_settings}\n" if model.params: - output += f"Parameters: \n{fmt_dict(model.params)}\n" + output += f"Parameters: \n{_helpers.fmt_dict(model.params)}\n" output += "\n" - if self.db: + for adb in self.dbs: output += db_header - output += f"Shards: {self.db.num_shards}\n" - output += f"Port: {str(self.db.ports[0])}\n" - output += f"Network: {self.db._interfaces}\n" - output += f"Batch Launch: {self.db.batch}\n" - if self.db.batch: - output += f"{str(self.db.batch_settings)}\n" + output += f"Shards: {adb.num_shards}\n" + output += f"Port: {str(adb.ports[0])}\n" + output += f"Network: {adb._interfaces}\n" + output += f"Batch Launch: {adb.batch}\n" + if adb.batch: + output += f"{str(adb.batch_settings)}\n" output += "\n" return output @@ -171,15 +178,17 @@ def __str__(self) -> str: def has_db_objects(self) -> bool: """Check if any entity has DBObjects to set""" - def has_db_models(entity: t.Union[EntityList, Model]) -> bool: + def has_db_models( + entity: t.Union[EntitySequence[SmartSimEntity], Model] + ) -> bool: return len(list(entity.db_models)) > 0 - def has_db_scripts(entity: t.Union[EntityList, Model]) -> bool: + def has_db_scripts( + entity: t.Union[EntitySequence[SmartSimEntity], Model] + ) -> bool: return len(list(entity.db_scripts)) > 0 has_db_objects = False - for model in self.models: - has_db_objects |= hasattr(model, "_db_models") # Check if any model has either a DBModel or a DBScript # we update has_db_objects so that as soon as one check @@ -214,3 +223,130 @@ def has_db_scripts(entity: t.Union[EntityList, Model]) -> bool: # `has_db_objects` should be False here return 
has_db_objects + + +class _LaunchedManifestMetadata(t.NamedTuple): + run_id: str + exp_name: str + exp_path: str + launcher_name: str + + @property + def exp_telemetry_subdirectory(self) -> pathlib.Path: + return _format_exp_telemetry_path(self.exp_path) + + @property + def run_telemetry_subdirectory(self) -> pathlib.Path: + return _format_run_telemetry_path(self.exp_path, self.exp_name, self.run_id) + + @property + def manifest_file_path(self) -> pathlib.Path: + return self.exp_telemetry_subdirectory / _serialize.MANIFEST_FILENAME + + +@dataclass(frozen=True) +class LaunchedManifest(t.Generic[_T]): + """Immutable manifest mapping launched entities or collections of launched + entities to other pieces of external data. This is commonly used to map a + launch-able entity to its constructed ``Step`` instance without assuming + that ``step.name == job.name`` or querying the ``JobManager`` which itself + can be ephemeral. + """ + + metadata: _LaunchedManifestMetadata + models: t.Tuple[t.Tuple[Model, _T], ...] + ensembles: t.Tuple[t.Tuple[Ensemble, t.Tuple[t.Tuple[Model, _T], ...]], ...] + databases: t.Tuple[t.Tuple[Orchestrator, t.Tuple[t.Tuple[DBNode, _T], ...]], ...] 
+ + def map(self, func: t.Callable[[_T], _U]) -> "LaunchedManifest[_U]": + def _map_entity_data( + fn: t.Callable[[_T], _U], + entity_list: t.Sequence[t.Tuple[_AtomicLaunchableT, _T]], + ) -> t.Tuple[t.Tuple[_AtomicLaunchableT, _U], ...]: + return tuple((entity, fn(data)) for entity, data in entity_list) + + return LaunchedManifest( + metadata=self.metadata, + models=_map_entity_data(func, self.models), + ensembles=tuple( + (ens, _map_entity_data(func, model_data)) + for ens, model_data in self.ensembles + ), + databases=tuple( + (db_, _map_entity_data(func, node_data)) + for db_, node_data in self.databases + ), + ) + + +@dataclass(frozen=True) +class LaunchedManifestBuilder(t.Generic[_T]): + """A class comprised of mutable collections of SmartSim entities that is + used to build a ``LaunchedManifest`` while going through the launching + process. + """ + + exp_name: str + exp_path: str + launcher_name: str + run_id: str = field(default_factory=_helpers.create_short_id_str) + + _models: t.List[t.Tuple[Model, _T]] = field(default_factory=list, init=False) + _ensembles: t.List[t.Tuple[Ensemble, t.Tuple[t.Tuple[Model, _T], ...]]] = field( + default_factory=list, init=False + ) + _databases: t.List[t.Tuple[Orchestrator, t.Tuple[t.Tuple[DBNode, _T], ...]]] = ( + field(default_factory=list, init=False) + ) + + @property + def exp_telemetry_subdirectory(self) -> pathlib.Path: + return _format_exp_telemetry_path(self.exp_path) + + @property + def run_telemetry_subdirectory(self) -> pathlib.Path: + return _format_run_telemetry_path(self.exp_path, self.exp_name, self.run_id) + + def add_model(self, model: Model, data: _T) -> None: + self._models.append((model, data)) + + def add_ensemble(self, ens: Ensemble, data: t.Sequence[_T]) -> None: + self._ensembles.append((ens, self._entities_to_data(ens.entities, data))) + + def add_database(self, db_: Orchestrator, data: t.Sequence[_T]) -> None: + self._databases.append((db_, self._entities_to_data(db_.entities, data))) + + 
@staticmethod + def _entities_to_data( + entities: t.Sequence[_AtomicLaunchableT], data: t.Sequence[_T] + ) -> t.Tuple[t.Tuple[_AtomicLaunchableT, _T], ...]: + if not entities: + raise ValueError("Cannot map data to an empty entity sequence") + if len(entities) != len(data): + raise ValueError( + f"Cannot map data sequence of length {len(data)} to entity " + f"sequence of length {len(entities)}" + ) + return tuple(zip(entities, data)) + + def finalize(self) -> LaunchedManifest[_T]: + return LaunchedManifest( + metadata=_LaunchedManifestMetadata( + self.run_id, self.exp_name, self.exp_path, self.launcher_name + ), + models=tuple(self._models), + ensembles=tuple(self._ensembles), + databases=tuple(self._databases), + ) + + +def _format_exp_telemetry_path( + exp_path: t.Union[str, "os.PathLike[str]"] +) -> pathlib.Path: + return pathlib.Path(exp_path, _serialize.TELMON_SUBDIR) + + +def _format_run_telemetry_path( + exp_path: t.Union[str, "os.PathLike[str]"], exp_name: str, run_id: str +) -> pathlib.Path: + return _format_exp_telemetry_path(exp_path) / f"{exp_name}/{run_id}" diff --git a/smartsim/_core/entrypoints/colocated.py b/smartsim/_core/entrypoints/colocated.py index ba82d355f..332d6e019 100644 --- a/smartsim/_core/entrypoints/colocated.py +++ b/smartsim/_core/entrypoints/colocated.py @@ -31,15 +31,13 @@ import sys import tempfile import typing as t - from pathlib import Path from subprocess import PIPE, STDOUT from types import FrameType - import filelock import psutil -from smartredis import Client +from smartredis import Client, ConfigOptions from smartredis.error import RedisConnectionError, RedisReplyError from smartsim._core.utils.network import current_ip @@ -75,15 +73,14 @@ def launch_db_model(client: Client, db_model: t.List[str]) -> str: parser.add_argument("--file", type=str) parser.add_argument("--backend", type=str) parser.add_argument("--device", type=str) - parser.add_argument("--devices_per_node", type=int) + 
parser.add_argument("--devices_per_node", type=int, default=1) + parser.add_argument("--first_device", type=int, default=0) parser.add_argument("--batch_size", type=int, default=0) parser.add_argument("--min_batch_size", type=int, default=0) + parser.add_argument("--min_batch_timeout", type=int, default=0) parser.add_argument("--tag", type=str, default="") parser.add_argument("--inputs", nargs="+", default=None) parser.add_argument("--outputs", nargs="+", default=None) - - # Unused if we use SmartRedis - parser.add_argument("--min_batch_timeout", type=int, default=None) args = parser.parse_args(db_model) inputs = None @@ -99,28 +96,30 @@ def launch_db_model(client: Client, db_model: t.List[str]) -> str: # devices_per_node being greater than one only applies to GPU devices if args.devices_per_node > 1 and args.device.lower() == "gpu": client.set_model_from_file_multigpu( - name, - args.file, - args.backend, - 0, - args.devices_per_node, - args.batch_size, - args.min_batch_size, - args.tag, - inputs, - outputs, + name=name, + model_file=args.file, + backend=args.backend, + first_gpu=args.first_device, + num_gpus=args.devices_per_node, + batch_size=args.batch_size, + min_batch_size=args.min_batch_size, + min_batch_timeout=args.min_batch_timeout, + tag=args.tag, + inputs=inputs, + outputs=outputs, ) else: client.set_model_from_file( - name, - args.file, - args.backend, - args.device, - args.batch_size, - args.min_batch_size, - args.tag, - inputs, - outputs, + name=name, + model_file=args.file, + backend=args.backend, + device=args.device, + batch_size=args.batch_size, + min_batch_size=args.min_batch_size, + min_batch_timeout=args.min_batch_timeout, + tag=args.tag, + inputs=inputs, + outputs=outputs, ) return name @@ -142,7 +141,8 @@ def launch_db_script(client: Client, db_script: t.List[str]) -> str: parser.add_argument("--file", type=str) parser.add_argument("--backend", type=str) parser.add_argument("--device", type=str) - parser.add_argument("--devices_per_node", 
type=int) + parser.add_argument("--devices_per_node", type=int, default=1) + parser.add_argument("--first_device", type=int, default=0) args = parser.parse_args(db_script) if args.file and args.func: @@ -151,13 +151,15 @@ def launch_db_script(client: Client, db_script: t.List[str]) -> str: if args.func: func = args.func.replace("\\n", "\n") if args.devices_per_node > 1 and args.device.lower() == "gpu": - client.set_script_multigpu(args.name, func, 0, args.devices_per_node) + client.set_script_multigpu( + args.name, func, args.first_device, args.devices_per_node + ) else: client.set_script(args.name, func, args.device) elif args.file: if args.devices_per_node > 1 and args.device.lower() == "gpu": client.set_script_from_file_multigpu( - args.name, args.file, 0, args.devices_per_node + args.name, args.file, args.first_device, args.devices_per_node ) else: client.set_script_from_file(args.name, args.file, args.device) @@ -173,18 +175,19 @@ def main( command: t.List[str], db_models: t.List[t.List[str]], db_scripts: t.List[t.List[str]], + db_identifier: str, ) -> None: global DBPID # pylint: disable=global-statement lo_address = current_ip("lo") - try: - ip_addresses = [ - current_ip(interface) for interface in network_interface.split(",") - ] - - except ValueError as e: - logger.warning(e) - ip_addresses = [] + ip_addresses = [] + if network_interface: + try: + ip_addresses = [ + current_ip(interface) for interface in network_interface.split(",") + ] + except ValueError as e: + logger.warning(e) if all(lo_address == ip_address for ip_address in ip_addresses) or not ip_addresses: cmd = command + [f"--bind {lo_address}"] @@ -212,6 +215,7 @@ def main( f"\n\tIP Address(es): {' '.join(ip_addresses + [lo_address])}" f"\n\tCommand: {' '.join(cmd)}\n\n" f"\n\t# of Database CPUs: {db_cpus}" + f"\n\tDatabase Identifier: {db_identifier}" ) except Exception as e: cleanup() @@ -233,7 +237,8 @@ def launch_db_scripts(client: Client, db_scripts: t.List[t.List[str]]) -> None: try: if 
db_models or db_scripts: try: - client = Client(cluster=False) + options = ConfigOptions.create_from_environment(db_identifier) + client = Client(options, logger_name="SmartSim") launch_models(client, db_models) launch_db_scripts(client, db_scripts) except (RedisConnectionError, RedisReplyError) as ex: @@ -291,6 +296,11 @@ def register_signal_handlers() -> None: arg_parser.add_argument( "+db_cpus", type=int, default=2, help="Number of CPUs to use for DB" ) + + arg_parser.add_argument( + "+db_identifier", type=str, default="", help="Database Identifier" + ) + arg_parser.add_argument("+command", nargs="+", help="Command to run") arg_parser.add_argument( "+db_model", @@ -317,7 +327,7 @@ def register_signal_handlers() -> None: LOCK.acquire(timeout=0.1) logger.debug(f"Starting colocated database on host: {socket.gethostname()}") - # make sure to register the cleanup before the start + # make sure to register the cleanup before we start # the proecss so our signaller will be able to stop # the database process. register_signal_handlers() @@ -328,6 +338,7 @@ def register_signal_handlers() -> None: parsed_args.command, parsed_args.db_model, parsed_args.db_script, + parsed_args.db_identifier, ) # gracefully exit the processes in the distributed application that diff --git a/smartsim/_core/entrypoints/indirect.py b/smartsim/_core/entrypoints/indirect.py new file mode 100644 index 000000000..18d27601f --- /dev/null +++ b/smartsim/_core/entrypoints/indirect.py @@ -0,0 +1,242 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023 Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import logging +import os +import pathlib +import signal +import sys +import typing as t +from types import FrameType + +import coloredlogs +import psutil + +import smartsim.log +from smartsim._core.entrypoints.telemetrymonitor import track_event +from smartsim._core.utils.helpers import decode_cmd, get_ts + +STEP_PID: t.Optional[int] = None +logger = smartsim.log.get_logger(__name__) + +# kill is not catchable +SIGNALS = [signal.SIGINT, signal.SIGTERM, signal.SIGQUIT, signal.SIGABRT] + + +def main( + cmd: str, + etype: str, + cwd: str, + status_dir: str, +) -> int: + """The main function of the entrypoint. This function takes an encoded step + command and runs it in a subprocess. In the background, this entrypoint + will then monitor the subprocess and write out status events such as when + the subprocess has started or stopped and write these events to a status + directory. 
+ """ + global STEP_PID # pylint: disable=global-statement + proxy_pid = os.getpid() + + status_path = pathlib.Path(status_dir) + if not status_path.exists(): + status_path.mkdir(parents=True, exist_ok=True) + + if not cmd.strip(): + raise ValueError("Invalid cmd supplied") + + cleaned_cmd = decode_cmd(cmd) + ret_code: int = 1 + logger.debug("Indirect step starting") + + start_detail = f"Proxy process {proxy_pid}" + start_rc: t.Optional[int] = None + + try: + process = psutil.Popen( + cleaned_cmd, + cwd=cwd, + stdout=sys.stdout, + stderr=sys.stderr, + ) + STEP_PID = process.pid + logger.info(f"Indirect proxy {proxy_pid} child process {STEP_PID} started") + start_detail += f" started child process {STEP_PID}" + + except Exception as ex: + start_detail += f" failed to start child process. {ex}" + start_rc = 1 + logger.error("Failed to create process", exc_info=True) + cleanup() + return 1 + finally: + track_event( + get_ts(), + proxy_pid, + "", # step_id for unmanaged task is always empty + etype, + "start", + status_path, + logger, + detail=start_detail, + return_code=start_rc, + ) + + logger.info(f"Waiting for child process {STEP_PID} to complete") + ret_code = process.wait() + + logger.info( + f"Indirect proxy {proxy_pid} child process {STEP_PID} complete." 
+ f" return code: {ret_code}" + ) + msg = f"Process {STEP_PID} finished with return code: {ret_code}" + track_event( + get_ts(), + proxy_pid, + "", # step_id for unmanaged task is always empty + etype, + "stop", + status_path, + logger, + detail=msg, + return_code=ret_code, + ) + cleanup() + + return ret_code + + +def cleanup() -> None: + """Perform cleanup required for clean termination""" + logger.info("Performing cleanup") + global STEP_PID # pylint: disable=global-statement + if STEP_PID is None: + return + + try: + # attempt to stop the subprocess performing step-execution + if psutil.pid_exists(STEP_PID): + process = psutil.Process(STEP_PID) + process.terminate() + except psutil.NoSuchProcess: + # swallow exception to avoid overwriting outputs from cmd + ... + + except OSError as ex: + logger.warning(f"Failed to clean up step executor gracefully: {ex}") + finally: + STEP_PID = None + + +def handle_signal(signo: int, _frame: t.Optional[FrameType]) -> None: + """Helper function to ensure clean process termination""" + logger.info(f"handling signal {signo}") + if not signo: + logger.warning("Received signal with no signo") + + cleanup() + + +def register_signal_handlers() -> None: + """Register a signal handling function for all termination events""" + for sig in SIGNALS: + signal.signal(sig, handle_signal) + + +def get_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prefix_chars="+", description="SmartSim Step Executor" + ) + parser.add_argument( + "+name", type=str, help="Name of the step being executed", required=True + ) + parser.add_argument( + "+command", type=str, help="The command to execute", required=True + ) + parser.add_argument( + "+entity_type", + type=str, + help="The type of entity related to the step", + required=True, + ) + parser.add_argument( + "+working_dir", + type=str, + help="The working directory of the executable", + required=True, + ) + parser.add_argument( + "+telemetry_dir", + type=str, + help="Directory 
for telemetry output", + required=True, + ) + return parser + + +if __name__ == "__main__": + arg_parser = get_parser() + os.environ["PYTHONUNBUFFERED"] = "1" + parsed_args = arg_parser.parse_args() + + # Set up a local private logger for when this module is run as an entry point + level = logger.getEffectiveLevel() + logger = logging.getLogger(f"{__name__}.{parsed_args.name}") + logger.propagate = False + logger.setLevel(level) + + fh = logging.FileHandler(f"{parsed_args.name}.indirect.log") + coloredlogs.HostNameFilter.install(fh) + fh.setFormatter( + logging.Formatter( + smartsim.log.DEFAULT_LOG_FORMAT, + datefmt=smartsim.log.DEFAULT_DATE_FORMAT, + ) + ) + logger.addHandler(fh) + + try: + logger.debug("Starting indirect step execution") + + # make sure to register the cleanup before we start the process + # so our signaller will be able to stop the child process. + register_signal_handlers() + + rc = main( + cmd=parsed_args.command, + etype=parsed_args.entity_type, + cwd=parsed_args.working_dir, + status_dir=parsed_args.telemetry_dir, + ) + sys.exit(rc) + + # gracefully exit the processes in the distributed application that + # we do not want to have start a colocated process. Only one process + # per node should be running. + except Exception as e: + logger.exception(f"An unexpected error caused step execution to fail: {e}") + sys.exit(1) diff --git a/smartsim/_core/entrypoints/redis.py b/smartsim/_core/entrypoints/redis.py index 782b4c583..ef9911829 100644 --- a/smartsim/_core/entrypoints/redis.py +++ b/smartsim/_core/entrypoints/redis.py @@ -25,16 +25,20 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
def build_bind_args(source_addr: str, *addrs: str) -> t.Tuple[str, ...]:
    """Assemble the address binding arguments for the database command line.

    :param source_addr: address listed first after ``--bind`` and also
        passed to ``--bind-source-addr``
    :type source_addr: str
    :param addrs: additional addresses to bind, in order
    :type addrs: str
    :returns: the ``--bind`` and ``--bind-source-addr`` arguments
    :rtype: tuple[str, ...]
    """
    bind_targets = (source_addr,) + addrs
    # pin source address to avoid random selection by Redis
    source_pin = ("--bind-source-addr", source_addr)
    return ("--bind",) + bind_targets + source_pin
def cleanup() -> None:
    """Terminate the database process recorded in ``DBPID``, if any.

    Failures are logged as warnings rather than raised.
    """
    logger.debug("Cleaning up database instance")

    # nothing was recorded as launched, so there is nothing to stop
    if DBPID is None:
        return

    try:
        # attempt to stop the database process
        database_process = psutil.Process(DBPID)
        database_process.terminate()
    except psutil.NoSuchProcess:
        logger.warning("Couldn't find database process to kill.")
    except OSError as e:
        logger.warning(f"Failed to clean up database gracefully: {str(e)}")
"+ifname", type=str, help="Network Interface name", default="lo" + "+orc-exe", type=str, help="Path to the orchestrator executable", required=True + ) + parser.add_argument( + "+conf-file", + type=str, + help="Path to the orchestrator configuration file", + required=True, + ) + parser.add_argument( + "+rai-module", + nargs="+", + type=str, + help=( + "Command for the orcestrator to load the Redis AI module with " + "symbols seperated by whitespace" + ), + required=True, + ) + parser.add_argument( + "+name", type=str, help="Name to identify the shard", required=True + ) + parser.add_argument( + "+port", + type=int, + help="The port on which to launch the shard of the orchestrator", + required=True, + ) + parser.add_argument( + "+ifname", type=str, help="Network Interface name", required=True + ) + parser.add_argument( + "+cluster", + action="store_true", + help="Specify if this orchestrator shard is part of a cluster", ) - parser.add_argument("+command", nargs="+", help="Command to run") - args = parser.parse_args() + args_ = parser.parse_args() # make sure to register the cleanup before the start - # the proecss so our signaller will be able to stop + # the process so our signaller will be able to stop # the database process. for sig in SIGNALS: signal.signal(sig, handle_signal) - main(args.ifname, args.command) + raise SystemExit(main(args_)) diff --git a/smartsim/_core/entrypoints/telemetrymonitor.py b/smartsim/_core/entrypoints/telemetrymonitor.py new file mode 100644 index 000000000..86d6fe72f --- /dev/null +++ b/smartsim/_core/entrypoints/telemetrymonitor.py @@ -0,0 +1,690 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023 Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import argparse +import json +import logging +import os +import pathlib +import signal +import sys +import threading +import time +import typing as t +from dataclasses import dataclass, field +from types import FrameType + +from watchdog.events import ( + FileCreatedEvent, + FileModifiedEvent, + LoggingEventHandler, + PatternMatchingEventHandler, +) +from watchdog.observers import Observer +from watchdog.observers.api import BaseObserver + +from smartsim._core.config import CONFIG +from smartsim._core.control.job import JobEntity, _JobKey +from smartsim._core.control.jobmanager import JobManager +from smartsim._core.launcher.cobalt.cobaltLauncher import CobaltLauncher +from smartsim._core.launcher.launcher import Launcher +from smartsim._core.launcher.local.local import LocalLauncher +from smartsim._core.launcher.lsf.lsfLauncher import LSFLauncher +from smartsim._core.launcher.pbs.pbsLauncher import PBSLauncher +from smartsim._core.launcher.slurm.slurmLauncher import SlurmLauncher +from smartsim._core.launcher.stepInfo import StepInfo +from smartsim._core.utils.helpers import get_ts +from smartsim._core.utils.serialize import MANIFEST_FILENAME, TELMON_SUBDIR +from smartsim.error.errors import SmartSimError +from smartsim.status import STATUS_COMPLETED, TERMINAL_STATUSES + +"""Telemetry Monitor entrypoint""" + +# kill is not catchable +SIGNALS = [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM, signal.SIGABRT] +_EventClass = t.Literal["start", "stop", "timestep"] +_MAX_MANIFEST_LOAD_ATTEMPTS: t.Final[int] = 6 + + +@dataclass +class Run: + """Model containing entities of an individual start call for an experiment""" + + timestamp: int + models: t.List[JobEntity] + orchestrators: t.List[JobEntity] + ensembles: t.List[JobEntity] + + def flatten( + self, filter_fn: t.Optional[t.Callable[[JobEntity], bool]] = None + ) -> t.List[JobEntity]: + """Flatten runs into a list of SmartSimEntity run events""" + entities = self.models + self.orchestrators + self.ensembles + if 
def hydrate_persistable(
    entity_type: str,
    persistable_entity: t.Dict[str, t.Any],
    exp_dir: pathlib.Path,
) -> t.List[JobEntity]:
    """Convert one persisted manifest entry into JobEntity instances.

    An entry holding a ``shards`` key yields one "orchestrator" entity per
    shard; an entry holding a ``models`` key yields one "model" entity per
    model (``shards`` wins when both keys are present). Any other entry
    yields a single entity of type ``entity_type``.

    :param entity_type: type assigned to an entry without child entities
    :type entity_type: str
    :param persistable_entity: the persisted entry
    :type persistable_entity: t.Dict[str, t.Any]
    :param exp_dir: experiment directory recorded on every entity
    :type exp_dir: pathlib.Path
    :returns: the hydrated entities
    :rtype: t.List[JobEntity]
    """
    root = str(exp_dir)

    # an entity w/parent key creates persistables for entities it contains
    if "shards" in persistable_entity:
        container, child_type = "shards", "orchestrator"
    elif "models" in persistable_entity:
        container, child_type = "models", "model"
    else:
        return [_hydrate_persistable(persistable_entity, entity_type, root)]

    return [
        _hydrate_persistable(child, child_type, root)
        for child in persistable_entity[container]
    ]
def load_manifest(file_path: str) -> t.Optional[RuntimeManifest]:
    """Load a persisted manifest and return the content

    The file is read up to ``_MAX_MANIFEST_LOAD_ATTEMPTS`` times, backing off
    between attempts, because the manifest may be observed before it has been
    fully written (empty or truncated JSON).

    :param file_path: path to the manifest file
    :type file_path: str
    :returns: the hydrated manifest, or None if no content could be loaded
    :rtype: t.Optional[RuntimeManifest]
    :raises ValueError: if the manifest is not a JSON object or is missing
        the required ``experiment`` or ``runs`` content
    """
    source = pathlib.Path(file_path).resolve()
    manifest_dict: t.Optional[t.Dict[str, t.Any]] = None

    for attempt in range(1, _MAX_MANIFEST_LOAD_ATTEMPTS + 1):
        try:
            if text := source.read_text(encoding="utf-8").strip():
                manifest_dict = json.loads(text)
        except json.JSONDecodeError as ex:
            print(f"Error loading manifest: {ex}")

        if manifest_dict is not None:
            break

        # hack/fix: handle issues reading file before it is fully written.
        # An empty file is backed off exactly like a truncated one, and no
        # time is wasted sleeping after the final attempt.
        if attempt < _MAX_MANIFEST_LOAD_ATTEMPTS:
            time.sleep(0.5 * attempt)

    if not manifest_dict:
        return None

    if not isinstance(manifest_dict, dict):
        # keep content problems in the ValueError family
        raise ValueError("Manifest content must be a JSON object")

    exp = manifest_dict.get("experiment", None)
    if not exp:
        raise ValueError("Manifest missing required experiment")

    runs = manifest_dict.get("runs", None)
    if runs is None:
        raise ValueError("Manifest missing required runs")

    exp_dir = pathlib.Path(exp["path"])
    runs = hydrate_runs(runs, exp_dir)

    manifest = RuntimeManifest(
        name=exp["name"],
        path=exp_dir,
        launcher=exp["launcher"],
        runs=runs,
    )
    return manifest
def init_launcher(self, launcher: str) -> Launcher:
    """Create a launcher instance for the requested launcher name.

    The name is lower-cased and looked up in the handler's launcher map.

    :param launcher: which launcher to initialize
    :type launcher: str
    :returns: a new instance of the matching launcher class
    :rtype: Launcher
    :raises TypeError: if no launcher argument is provided
    :raises ValueError: if the name does not match a known launcher
    """
    if not launcher:
        raise TypeError("Must provide a 'launcher' argument")

    launcher_cls = self._launcher_map.get(launcher.lower())
    if not launcher_cls:
        raise ValueError("Launcher type not supported: " + launcher)

    return launcher_cls()
def _to_completed(
    self,
    timestamp: int,
    entity: JobEntity,
    step_info: StepInfo,
) -> None:
    """Move a monitored entity from the active to completed collection to
    stop monitoring for updates during timesteps, and persist a "stop"
    tracking event for it.

    :param timestamp: the current timestamp for event logging
    :type timestamp: int
    :param entity: the running SmartSim Job
    :type entity: JobEntity
    :param step_info: the StepInfo received when requesting a Job status update
    :type step_info: StepInfo
    """
    inactive_entity = self._tracked_jobs.pop(entity.key)
    if entity.key not in self._completed_jobs:
        self._completed_jobs[entity.key] = inactive_entity

    job = self.job_manager[entity.name]
    self.job_manager.move_to_completed(job)

    status_clause = f"status: {step_info.status}"
    error_clause = f", error: {step_info.error}" if step_info.error else ""
    detail = f"{status_clause}{error_clause}"

    if hasattr(job.entity, "status_dir"):
        write_path = pathlib.Path(job.entity.status_dir)
    else:
        # fall back to the monitored entity's own status directory so the
        # stop event is still written instead of failing on an unbound name
        write_path = pathlib.Path(entity.status_dir)

    track_event(
        timestamp,
        entity.task_id,
        entity.step_id,
        entity.type,
        "stop",
        write_path,
        self._logger,
        detail=detail,
        return_code=faux_return_code(step_info),
    )
def can_shutdown(action_handler: ManifestEventHandler, logger: logging.Logger) -> bool:
    """Report whether the telemetry monitor has nothing left to watch.

    :param action_handler: the manifest event processor whose job manager
        is inspected
    :type action_handler: ManifestEventHandler
    :param logger: logger receiving debug output about remaining work
    :type logger: logging.Logger
    :returns: True when the job manager holds neither jobs nor db jobs
    :rtype: bool
    """
    manager = action_handler.job_manager
    active_jobs = manager.jobs
    active_dbs = manager.db_jobs

    if active_jobs:
        logger.debug(f"telemetry monitor is monitoring {len(active_jobs)} jobs")
    if active_dbs:
        logger.debug(f"telemetry monitor is monitoring {len(active_dbs)} dbs")

    return not (active_jobs or active_dbs)
cooldown_duration: int + """ + elapsed: int = 0 + last_ts: int = get_ts() + + while observer.is_alive(): + timestamp = get_ts() + logger.debug(f"Telemetry timestep: {timestamp}") + action_handler.on_timestep(timestamp) + + elapsed += timestamp - last_ts + last_ts = timestamp + + if can_shutdown(action_handler, logger): + if elapsed >= cooldown_duration: + logger.info("beginning telemetry manager shutdown") + observer.stop() # type: ignore + else: + # reset cooldown any time there are still jobs running + elapsed = 0 + + time.sleep(frequency) + + +def main( + frequency: t.Union[int, float], + experiment_dir: pathlib.Path, + logger: logging.Logger, + observer: t.Optional[BaseObserver] = None, + cooldown_duration: t.Optional[int] = 0, +) -> int: + """Setup the monitoring entities and start the timer-based loop that + will poll for telemetry data + + :param frequency: frequency (in seconds) of update loop + :type frequency: t.Union[int, float] + :param experiment_dir: the experiement directory to monitor for changes + :type experiment_dir: pathlib.Path + :param logger: a preconfigured Logger instance + :type logger: logging.Logger + :param observer: (optional) a preconfigured Observer to inject + :type observer: t.Optional[BaseObserver] + :param cooldown_duration: number of seconds the telemetry monitor should + poll for new jobs before attempting to shutdown + :type cooldown_duration: int + """ + manifest_relpath = pathlib.Path(TELMON_SUBDIR) / MANIFEST_FILENAME + manifest_path = experiment_dir / manifest_relpath + monitor_pattern = str(manifest_relpath) + + logger.info( + f"Executing telemetry monitor with frequency: {frequency}s" + f", on target directory: {experiment_dir}" + f" matching pattern: {monitor_pattern}" + ) + + cooldown_duration = cooldown_duration or CONFIG.telemetry_cooldown + log_handler = LoggingEventHandler(logger) # type: ignore + action_handler = ManifestEventHandler(monitor_pattern, logger) + + if observer is None: + observer = Observer() + + 
if __name__ == "__main__":
    os.environ["PYTHONUNBUFFERED"] = "1"

    parser = get_parser()
    args = parser.parse_args()

    # dedicated, non-propagating logger writing into the telemetry
    # subdirectory of the experiment
    log = logging.getLogger(f"{__name__}.TelemetryMonitor")
    log.setLevel(logging.DEBUG)
    log.propagate = False

    log_path = os.path.join(args.exp_dir, TELMON_SUBDIR, "telemetrymonitor.log")
    fh = logging.FileHandler(log_path, "a")
    log.addHandler(fh)

    # Must register cleanup before the main loop is running
    register_signal_handlers()

    try:
        # report the status returned by main (os.EX_OK or os.EX_SOFTWARE)
        # instead of unconditionally exiting with success
        exit_code = main(
            int(args.frequency),
            pathlib.Path(args.exp_dir),
            log,
            cooldown_duration=args.cooldown,
        )
    except Exception:
        log.exception(
            "Shutting down telemetry monitor due to unexpected error", exc_info=True
        )
        exit_code = 1

    sys.exit(exit_code)
+ :param gen_path: Path in which files need to be generated + :type gen_path: str :param overwrite: toggle entity replacement, defaults to False :type overwrite: bool, optional + :param verbose: Whether generation information should be logged to std out + :type verbose: bool, optional """ self._writer = ModelWriter() self.gen_path = gen_path self.overwrite = overwrite + self.log_level = DEBUG if not verbose else INFO + + @property + def log_file(self) -> str: + """Returns the location of the file + summarizing the parameters used for the last generation + of all generated entities. + + :returns: path to file with parameter settings + :rtype: str + """ + return join(self.gen_path, "smartsim_params.txt") def generate_experiment(self, *args: t.Any) -> None: """Run ensemble and experiment file structure generation @@ -87,8 +107,9 @@ def generate_experiment(self, *args: t.Any) -> None: """ generator_manifest = Manifest(*args) + self._gen_exp_dir() - self._gen_orc_dir(generator_manifest.db) + self._gen_orc_dir(generator_manifest.dbs) self._gen_entity_list_dir(generator_manifest.ensembles) self._gen_entity_dirs(generator_manifest.models) @@ -129,31 +150,39 @@ def _gen_exp_dir(self) -> None: # keep exists ok for race conditions on NFS pathlib.Path(self.gen_path).mkdir(exist_ok=True) else: - logger.info("Working in previously created experiment") + logger.log( + level=self.log_level, msg="Working in previously created experiment" + ) - def _gen_orc_dir(self, orchestrator: t.Optional[Orchestrator]) -> None: + # The log_file only keeps track of the last generation + # this is to avoid gigantic files in case the user repeats + # generation several times. 
The information is anyhow + # redundant, as it is also written in each entity's dir + with open(self.log_file, mode="w", encoding="utf-8") as log_file: + dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S") + log_file.write(f"Generation start date and time: {dt_string}\n") + + def _gen_orc_dir(self, orchestrator_list: t.List[Orchestrator]) -> None: """Create the directory that will hold the error, output and configuration files for the orchestrator. :param orchestrator: Orchestrator instance :type orchestrator: Orchestrator | None """ + # Loop through orchestrators + for orchestrator in orchestrator_list: + orc_path = path.join(self.gen_path, orchestrator.name) - if not orchestrator: - return - - orc_path = path.join(self.gen_path, "database") - orchestrator.set_path(orc_path) - - # Always remove orchestrator files if present. - if path.isdir(orc_path): - shutil.rmtree(orc_path, ignore_errors=True) - pathlib.Path(orc_path).mkdir(exist_ok=True) + orchestrator.set_path(orc_path) + # Always remove orchestrator files if present. 
def _log_params(
    self, entity: Model, files_to_params: t.Dict[str, t.Dict[str, str]]
) -> None:
    """Report which files were configured for a model and with which values.

    When at least one parameter was applied, a summary is logged at the
    generator's log level, appended to the generator-wide log file and
    written to ``smartsim_params.txt`` inside the model's own path.
    Otherwise only a "no parameters" message is logged.

    :param entity: the model being generated
    :type entity: Model
    :param files_to_params: a dict connecting each file to its parameter settings
    :type files_to_params: t.Dict[str, t.Dict[str, str]]
    """
    merged_params: t.Dict[str, str] = {}
    tables_by_file: t.Dict[str, str] = {}
    for file_path, file_params in files_to_params.items():
        merged_params.update(file_params)
        # key each table by the file's path relative to the generation dir
        tables_by_file[relpath(file_path, self.gen_path)] = tabulate(
            file_params.items(), headers=["Name", "Value"]
        )

    if not merged_params:
        logger.log(
            level=self.log_level,
            msg=f"Configured model {entity.name} with no parameters",
        )
        return

    used_params_str = ", ".join(
        f"{name}={value}" for name, value in merged_params.items()
    )
    logger.log(
        level=self.log_level,
        msg=f"Configured model {entity.name} with params {used_params_str}",
    )

    file_table = tabulate(
        tables_by_file.items(),
        headers=["File name", "Parameters"],
    )
    log_entry = f"Model name: {entity.name}\n{file_table}\n\n"

    # generator-wide summary accumulates entries for every model
    with open(self.log_file, mode="a", encoding="utf-8") as logfile:
        logfile.write(log_entry)

    # the per-model copy is rewritten on each generation
    local_log_path = join(entity.path, "smartsim_params.txt")
    with open(local_log_path, mode="w", encoding="utf-8") as local_logfile:
        local_logfile.write(log_entry)
_replace_tags( + self, params: t.Dict[str, str], make_fatal: bool = False + ) -> t.Dict[str, str]: + """Replace the tagged parameters within the file attached to this model. The tag defaults to ";" :param model: The model instance @@ -113,9 +121,12 @@ def _replace_tags(self, params: t.Dict[str, str], make_fatal: bool = False) -> N :param make_fatal: (Optional) Set to True to force a fatal error if a tag is not matched :type make_fatal: bool + :returns: A dict of parameter names and values set for the file + :rtype: dict[str,str] """ edited = [] unused_tags: t.Dict[str, t.List[int]] = {} + used_params: t.Dict[str, str] = {} for i, line in enumerate(self.lines): search = re.search(self.regex, line) if search: @@ -126,6 +137,7 @@ def _replace_tags(self, params: t.Dict[str, str], make_fatal: bool = False) -> N new_val = str(params[previous_value]) new_line = re.sub(self.regex, new_val, line, 1) search = re.search(self.regex, new_line) + used_params[previous_value] = new_val if not search: edited.append(new_line) else: @@ -143,13 +155,12 @@ def _replace_tags(self, params: t.Dict[str, str], make_fatal: bool = False) -> N else: edited.append(line) for tag, value in unused_tags.items(): - missing_tag_message = ( - f"Unused tag {tag} on line(s): {str(value)}" - ) + missing_tag_message = f"Unused tag {tag} on line(s): {str(value)}" if make_fatal: raise SmartSimError(missing_tag_message) logger.warning(missing_tag_message) self.lines = edited + return used_params def _is_ensemble_spec( self, tagged_line: str, model_params: t.Dict[str, str] diff --git a/smartsim/_core/launcher/__init__.py b/smartsim/_core/launcher/__init__.py index d28cb2588..6e1aa724e 100644 --- a/smartsim/_core/launcher/__init__.py +++ b/smartsim/_core/launcher/__init__.py @@ -24,8 +24,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from .launcher import Launcher from .cobalt.cobaltLauncher import CobaltLauncher +from .launcher import Launcher from .local.local import LocalLauncher from .lsf.lsfLauncher import LSFLauncher from .pbs.pbsLauncher import PBSLauncher diff --git a/smartsim/_core/launcher/cobalt/cobaltLauncher.py b/smartsim/_core/launcher/cobalt/cobaltLauncher.py index ca0b88a3b..56ebe12cc 100644 --- a/smartsim/_core/launcher/cobalt/cobaltLauncher.py +++ b/smartsim/_core/launcher/cobalt/cobaltLauncher.py @@ -31,13 +31,13 @@ from smartsim._core.launcher.step import Step from smartsim.settings import ( - RunSettings, - SettingsBase, AprunSettings, CobaltBatchSettings, - MpirunSettings, MpiexecSettings, + MpirunSettings, OrterunSettings, + RunSettings, + SettingsBase, ) from ....error import LauncherError @@ -47,13 +47,13 @@ from ..launcher import WLMLauncher from ..pbs.pbsCommands import qdel, qstat from ..step import ( - Step, AprunStep, CobaltBatchStep, LocalStep, MpiexecStep, MpirunStep, OrterunStep, + Step, ) from ..stepInfo import CobaltStepInfo, StepInfo from .cobaltParser import parse_cobalt_step_id, parse_cobalt_step_status, parse_qsub_out @@ -117,16 +117,13 @@ def run(self, step: Step) -> t.Optional[str]: # aprun doesn't direct output for us. 
out, err = step.get_output_files() - # LocalStep.run_command omits env, include it here - passed_env = step.env if isinstance(step, LocalStep) else None - # pylint: disable-next=consider-using-with output = open(out, "w+", encoding="utf-8") # pylint: disable-next=consider-using-with error = open(err, "w+", encoding="utf-8") task_id = self.task_manager.start_task( - cmd_list, step.cwd, passed_env, out=output.fileno(), err=error.fileno() + cmd_list, step.cwd, step.env, out=output.fileno(), err=error.fileno() ) # if batch submission did not successfully retrieve job ID diff --git a/smartsim/_core/launcher/cobalt/cobaltParser.py b/smartsim/_core/launcher/cobalt/cobaltParser.py index ab205491e..c76509d36 100644 --- a/smartsim/_core/launcher/cobalt/cobaltParser.py +++ b/smartsim/_core/launcher/cobalt/cobaltParser.py @@ -78,7 +78,7 @@ def parse_qsub_out(output: str) -> str: for line in output.split("\n"): try: value = line.strip() - int(value) # if the cast works, return original string + int(value) # if the cast works, return original string step_id = value break except ValueError: diff --git a/smartsim/_core/launcher/colocated.py b/smartsim/_core/launcher/colocated.py index 84e269c1f..ea331023c 100644 --- a/smartsim/_core/launcher/colocated.py +++ b/smartsim/_core/launcher/colocated.py @@ -27,11 +27,10 @@ import sys import typing as t - +from ...entity.dbobject import DBModel, DBScript from ...error import SSInternalError from ..config import CONFIG from ..utils.helpers import create_lockfile_name -from ...entity.dbobject import DBModel, DBScript def write_colocated_launch_script( @@ -119,14 +118,14 @@ def _build_colocated_wrapper_cmd( # up the backgrounded db process cmd = [ - sys.executable, - "-m", - "smartsim._core.entrypoints.colocated", - "+lockfile", - lockfile, - "+db_cpus", - str(cpus), - ] + sys.executable, + "-m", + "smartsim._core.entrypoints.colocated", + "+lockfile", + lockfile, + "+db_cpus", + str(cpus), + ] # Add in the interface if using TCP/IP if 
ifname: if isinstance(ifname, str): @@ -137,16 +136,9 @@ def _build_colocated_wrapper_cmd( db_cmd = [] if custom_pinning: - db_cmd.extend([ - 'taskset', '-c', custom_pinning - ]) + db_cmd.extend(["taskset", "-c", custom_pinning]) db_cmd.extend( - [ - CONFIG.database_exe, - CONFIG.database_conf, - "--loadmodule", - CONFIG.redisai - ] + [CONFIG.database_exe, CONFIG.database_conf, "--loadmodule", CONFIG.redisai] ) # add extra redisAI configurations @@ -218,6 +210,7 @@ def _build_db_model_cmd(db_models: t.List[DBModel]) -> t.List[str]: cmd.append(f"--backend={db_model.backend}") cmd.append(f"--device={db_model.device}") cmd.append(f"--devices_per_node={db_model.devices_per_node}") + cmd.append(f"--first_device={db_model.first_device}") if db_model.batch_size: cmd.append(f"--batch_size={db_model.batch_size}") if db_model.min_batch_size: @@ -254,5 +247,5 @@ def _build_db_script_cmd(db_scripts: t.List[DBScript]) -> t.List[str]: cmd.append(f"--file={db_script.file}") cmd.append(f"--device={db_script.device}") cmd.append(f"--devices_per_node={db_script.devices_per_node}") - + cmd.append(f"--first_device={db_script.first_device}") return cmd diff --git a/smartsim/_core/launcher/launcher.py b/smartsim/_core/launcher/launcher.py index ec8bb0120..61f0460f9 100644 --- a/smartsim/_core/launcher/launcher.py +++ b/smartsim/_core/launcher/launcher.py @@ -28,11 +28,11 @@ import typing as t from ...error import AllocationError, LauncherError, SSUnsupportedError -from .stepInfo import UnmanagedStepInfo, StepInfo +from ...settings import SettingsBase +from .step import Step +from .stepInfo import StepInfo, UnmanagedStepInfo from .stepMapping import StepMapping from .taskManager import TaskManager -from .step import Step -from ...settings import SettingsBase class Launcher(abc.ABC): # pragma: no cover @@ -47,11 +47,6 @@ class Launcher(abc.ABC): # pragma: no cover step_mapping: StepMapping task_manager: TaskManager - @property - @abc.abstractmethod - def supported_rs(self) -> 
t.Dict[t.Type[SettingsBase], t.Type[Step]]: - raise NotImplementedError - @abc.abstractmethod def create_step(self, name: str, cwd: str, step_settings: SettingsBase) -> Step: raise NotImplementedError @@ -86,6 +81,11 @@ def __init__(self) -> None: self.task_manager = TaskManager() self.step_mapping = StepMapping() + @property + @abc.abstractmethod + def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: + raise NotImplementedError + # every launcher utilizing this interface must have a map # of supported RunSettings types (see slurmLauncher.py for ex) def create_step( @@ -176,6 +176,6 @@ def _get_unmanaged_step_update( # pylint: disable-next=no-self-use def _get_managed_step_update( self, - step_ids: t.List[str], # pylint: disable=unused-argument + step_ids: t.List[str], # pylint: disable=unused-argument ) -> t.List[StepInfo]: # pragma: no cover return [] diff --git a/tests/test_configs/test_dir/test.py b/smartsim/_core/launcher/local/__init__.py similarity index 100% rename from tests/test_configs/test_dir/test.py rename to smartsim/_core/launcher/local/__init__.py diff --git a/smartsim/_core/launcher/local/local.py b/smartsim/_core/launcher/local/local.py index 7e5c56f7b..fee058d16 100644 --- a/smartsim/_core/launcher/local/local.py +++ b/smartsim/_core/launcher/local/local.py @@ -26,27 +26,17 @@ import typing as t -from ..launcher import Launcher -from ....log import get_logger from ....settings import RunSettings, SettingsBase -from ..step import LocalStep -from ..step import Step -from ..stepInfo import UnmanagedStepInfo, StepInfo +from ..launcher import Launcher +from ..step import LocalStep, Step +from ..stepInfo import StepInfo, UnmanagedStepInfo from ..stepMapping import StepMapping from ..taskManager import TaskManager -logger = get_logger(__name__) - class LocalLauncher(Launcher): """Launcher used for spawning proceses on a localhost machine.""" - @property - def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: - return { 
- RunSettings: LocalStep, - } - def __init__(self) -> None: self.task_manager = TaskManager() self.step_mapping = StepMapping() @@ -58,26 +48,28 @@ def create_step(self, name: str, cwd: str, step_settings: SettingsBase) -> Step: """ if not isinstance(step_settings, RunSettings): raise TypeError( - f"Local Launcher only supports entities with RunSettings, not {type(step_settings)}" + "Local Launcher only supports entities with RunSettings, " + f"not {type(step_settings)}" ) - step = LocalStep(name, cwd, step_settings) - return step + return LocalStep(name, cwd, step_settings) - def get_step_update(self, step_names: t.List[str]) -> t.List[t.Tuple[str, t.Optional[StepInfo]]]: + def get_step_update( + self, step_names: t.List[str] + ) -> t.List[t.Tuple[str, t.Optional[StepInfo]]]: """Get status updates of each job step name provided :param step_names: list of step_names :type step_names: list[str] :return: list of tuples for update - :rtype: list[(str, UnmanagedStepInfo)] + :rtype: list[tuple[str, StepInfo | None]] """ # step ids are process ids of the tasks # as there is no WLM intermediary updates: t.List[t.Tuple[str, t.Optional[StepInfo]]] = [] s_names, s_ids = self.step_mapping.get_ids(step_names, managed=False) for step_name, step_id in zip(s_names, s_ids): - status, rc, out, err = self.task_manager.get_task_update(str(step_id)) - step_info = UnmanagedStepInfo(status, rc, out, err) + status, ret_code, out, err = self.task_manager.get_task_update(str(step_id)) + step_info = UnmanagedStepInfo(status, ret_code, out, err) update = (step_name, step_info) updates.append(update) return updates @@ -85,8 +77,12 @@ def get_step_update(self, step_names: t.List[str]) -> t.List[t.Tuple[str, t.Opti def get_step_nodes(self, step_names: t.List[str]) -> t.List[t.List[str]]: """Return the address of nodes assigned to the step + :param step_names: list of step_names + :type step_names: list[str] + :return: list of node addresses + :rtype: list[list[str]] + TODO: Use socket to find 
the actual Lo address? - :return: a list containing the local host address """ return [["127.0.0.1"] * len(step_names)] @@ -104,16 +100,17 @@ def run(self, step: Step) -> str: self.task_manager.start() out, err = step.get_output_files() - output = open(out, "w+") - error = open(err, "w+") cmd = step.get_launch_cmd() - # LocalStep.run_command omits env, include it here - passed_env = step.env if isinstance(step, LocalStep) else None + # pylint: disable-next=consider-using-with + output = open(out, "w+", encoding="utf-8") + # pylint: disable-next=consider-using-with + error = open(err, "w+", encoding="utf-8") task_id = self.task_manager.start_task( - cmd, step.cwd, env=passed_env, out=output.fileno(), err=error.fileno() + cmd, step.cwd, env=step.env, out=output.fileno(), err=error.fileno() ) + self.step_mapping.add(step.name, task_id=task_id, managed=False) return task_id @@ -127,10 +124,10 @@ def stop(self, step_name: str) -> UnmanagedStepInfo: """ # step_id is task_id for local. Naming for consistency step_id = self.step_mapping[step_name].task_id - + self.task_manager.remove_task(str(step_id)) - _, rc, out, err = self.task_manager.get_task_update(str(step_id)) - step_info = UnmanagedStepInfo("Cancelled", rc, out, err) + _, ret_code, out, err = self.task_manager.get_task_update(str(step_id)) + step_info = UnmanagedStepInfo("Cancelled", ret_code, out, err) return step_info def __str__(self) -> str: diff --git a/smartsim/_core/launcher/lsf/lsfLauncher.py b/smartsim/_core/launcher/lsf/lsfLauncher.py index a8d0e27aa..c2f432807 100644 --- a/smartsim/_core/launcher/lsf/lsfLauncher.py +++ b/smartsim/_core/launcher/lsf/lsfLauncher.py @@ -30,25 +30,25 @@ from ....error import LauncherError from ....log import get_logger from ....settings import ( - SettingsBase, BsubBatchSettings, JsrunSettings, MpiexecSettings, MpirunSettings, OrterunSettings, RunSettings, + SettingsBase, ) from ....status import STATUS_CANCELLED, STATUS_COMPLETED from ...config import CONFIG from 
..launcher import WLMLauncher from ..step import ( - Step, BsubBatchStep, JsrunStep, LocalStep, MpiexecStep, MpirunStep, OrterunStep, + Step, ) from ..stepInfo import LSFBatchStepInfo, LSFJsrunStepInfo, StepInfo from .lsfCommands import bjobs, bkill, jskill, jslist @@ -115,19 +115,16 @@ def run(self, step: Step) -> t.Optional[str]: time.sleep(1) step_id = self._get_lsf_step_id(step) logger.debug(f"Gleaned jsrun step id: {step_id} for {step.name}") - else: # isinstance(step, MpirunStep) or isinstance(step, LocalStep) + else: # mpirun and local launch don't direct output for us out, err = step.get_output_files() - # LocalStep.run_command omits env, include it here - passed_env = step.env if isinstance(step, LocalStep) else None - # pylint: disable-next=consider-using-with output = open(out, "w+", encoding="utf-8") # pylint: disable-next=consider-using-with error = open(err, "w+", encoding="utf-8") task_id = self.task_manager.start_task( - cmd_list, step.cwd, passed_env, out=output.fileno(), err=error.fileno() + cmd_list, step.cwd, step.env, out=output.fileno(), err=error.fileno() ) self.step_mapping.add(step.name, step_id, task_id, step.managed) diff --git a/smartsim/_core/launcher/pbs/pbsLauncher.py b/smartsim/_core/launcher/pbs/pbsLauncher.py index cbb85337c..1b77ffd81 100644 --- a/smartsim/_core/launcher/pbs/pbsLauncher.py +++ b/smartsim/_core/launcher/pbs/pbsLauncher.py @@ -30,26 +30,26 @@ from ....error import LauncherError from ....log import get_logger from ....settings import ( - SettingsBase, AprunSettings, - QsubBatchSettings, MpiexecSettings, MpirunSettings, OrterunSettings, - RunSettings, PalsMpiexecSettings, + QsubBatchSettings, + RunSettings, + SettingsBase, ) from ....status import STATUS_CANCELLED, STATUS_COMPLETED from ...config import CONFIG from ..launcher import WLMLauncher from ..step import ( - Step, AprunStep, LocalStep, MpiexecStep, MpirunStep, OrterunStep, QsubBatchStep, + Step, ) from ..stepInfo import PBSStepInfo, StepInfo from .pbsCommands 
import qdel, qstat @@ -111,15 +111,12 @@ def run(self, step: Step) -> t.Optional[str]: # aprun/local doesn't direct output for us. out, err = step.get_output_files() - # LocalStep.run_command omits env, include it here - passed_env = step.env if isinstance(step, LocalStep) else None - # pylint: disable-next=consider-using-with output = open(out, "w+", encoding="utf-8") # pylint: disable-next=consider-using-with error = open(err, "w+", encoding="utf-8") task_id = self.task_manager.start_task( - cmd_list, step.cwd, passed_env, out=output.fileno(), err=error.fileno() + cmd_list, step.cwd, step.env, out=output.fileno(), err=error.fileno() ) # if batch submission did not successfully retrieve job ID diff --git a/smartsim/_core/launcher/slurm/slurmCommands.py b/smartsim/_core/launcher/slurm/slurmCommands.py index e28bf482f..ecf545b91 100644 --- a/smartsim/_core/launcher/slurm/slurmCommands.py +++ b/smartsim/_core/launcher/slurm/slurmCommands.py @@ -27,76 +27,69 @@ import typing as t from ....error import LauncherError +from ....log import get_logger from ...utils.helpers import expand_exe_path from ..util.shell import execute_cmd +logger = get_logger(__name__) -def sstat(args: t.List[str]) -> t.Tuple[str, str]: + +def sstat(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str]: """Calls sstat with args :param args: List of command arguments :type args: List of str :returns: Output and error of sstat """ - _sstat = _find_slurm_command("sstat") - cmd = [_sstat] + args - _, out, error = execute_cmd(cmd) - return out, error + _, out, err = _execute_slurm_cmd("sstat", args, raise_on_err=raise_on_err) + return out, err -def sacct(args: t.List[str]) -> t.Tuple[str, str]: +def sacct(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str]: """Calls sacct with args :param args: List of command arguments :type args: List of str :returns: Output and error of sacct """ - _sacct = _find_slurm_command("sacct") - cmd = [_sacct] + args - _, out, error = 
execute_cmd(cmd) - return out, error + _, out, err = _execute_slurm_cmd("sacct", args, raise_on_err=raise_on_err) + return out, err -def salloc(args: t.List[str]) -> t.Tuple[str, str]: +def salloc(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str]: """Calls slurm salloc with args :param args: List of command arguments :type args: List of str :returns: Output and error of salloc """ - _salloc = _find_slurm_command("salloc") - cmd = [_salloc] + args - _, out, error = execute_cmd(cmd) - return out, error + _, out, err = _execute_slurm_cmd("salloc", args, raise_on_err=raise_on_err) + return out, err -def sinfo(args: t.List[str]) -> t.Tuple[str, str]: +def sinfo(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str]: """Calls slurm sinfo with args :param args: List of command arguments :type args: List of str :returns: Output and error of sinfo """ - _sinfo = _find_slurm_command("sinfo") - cmd = [_sinfo] + args - _, out, error = execute_cmd(cmd) - return out, error + _, out, err = _execute_slurm_cmd("sinfo", args, raise_on_err=raise_on_err) + return out, err -def scontrol(args: t.List[str]) -> t.Tuple[str, str]: +def scontrol(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str]: """Calls slurm scontrol with args :param args: List of command arguments :type args: List of str :returns: Output and error of sinfo """ - _scontrol = _find_slurm_command("scontrol") - cmd = [_scontrol] + args - _, out, error = execute_cmd(cmd) - return out, error + _, out, err = _execute_slurm_cmd("scontrol", args, raise_on_err=raise_on_err) + return out, err -def scancel(args: t.List[str]) -> t.Tuple[int, str, str]: +def scancel(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[int, str, str]: """Calls slurm scancel with args. returncode is also supplied in this function. 
@@ -106,10 +99,7 @@ def scancel(args: t.List[str]) -> t.Tuple[int, str, str]: :return: output and error :rtype: str """ - _scancel = _find_slurm_command("scancel") - cmd = [_scancel] + args - returncode, out, error = execute_cmd(cmd) - return returncode, out, error + return _execute_slurm_cmd("scancel", args, raise_on_err=raise_on_err) def _find_slurm_command(cmd: str) -> str: @@ -120,3 +110,17 @@ def _find_slurm_command(cmd: str) -> str: raise LauncherError( f"Slurm Launcher could not find path of {cmd} command" ) from e + + +def _execute_slurm_cmd( + command: str, args: t.List[str], raise_on_err: bool = False +) -> t.Tuple[int, str, str]: + cmd_exe = _find_slurm_command(command) + cmd = [cmd_exe] + args + returncode, out, error = execute_cmd(cmd) + if returncode != 0: + msg = f"An error occurred while calling {command}: {error}" + if raise_on_err: + raise LauncherError(msg) + logger.error(msg) + return returncode, out, error diff --git a/smartsim/_core/launcher/slurm/slurmLauncher.py b/smartsim/_core/launcher/slurm/slurmLauncher.py index 70bdab5a2..cba8df4f1 100644 --- a/smartsim/_core/launcher/slurm/slurmLauncher.py +++ b/smartsim/_core/launcher/slurm/slurmLauncher.py @@ -107,10 +107,7 @@ def get_step_nodes(self, step_names: t.List[str]) -> t.List[t.List[str]]: """ _, step_ids = self.step_mapping.get_ids(step_names, managed=True) step_str = _create_step_id_str([val for val in step_ids if val is not None]) - output, error = sstat([step_str, "-i", "-n", "-p", "-a"]) - - if "error:" in error.split(" "): - raise LauncherError("Failed to retrieve nodelist from stat") + output, _ = sstat([step_str, "-i", "-n", "-p", "-a"], raise_on_err=True) # parse node list for each step node_lists = [] @@ -155,15 +152,12 @@ def run(self, step: Step) -> t.Optional[str]: # MPI/local steps don't direct output like slurm steps out, err = step.get_output_files() - # LocalStep.run_command omits env, include it here - passed_env = step.env if isinstance(step, LocalStep) else None - # 
pylint: disable-next=consider-using-with output = open(out, "w+", encoding="utf-8") # pylint: disable-next=consider-using-with error = open(err, "w+", encoding="utf-8") task_id = self.task_manager.start_task( - cmd_list, step.cwd, passed_env, out=output.fileno(), err=error.fileno() + cmd_list, step.cwd, step.env, out=output.fileno(), err=error.fileno() ) if not step_id and step.managed: @@ -243,9 +237,9 @@ def _get_slurm_step_id(step: Step, interval: int = 2) -> str: step_id: t.Optional[str] = None trials = CONFIG.wlm_trials while trials > 0: - output, err = sacct(["--noheader", "-p", "--format=jobname,jobid"]) - if err: - logger.warning(f"An error occurred while calling sacct: {err}") + output, _ = sacct( + ["--noheader", "-p", "--format=jobname,jobid"], raise_on_err=True + ) step_id = parse_step_id_from_sacct(output, step.name) if step_id: @@ -266,7 +260,10 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: :rtype: list[StepInfo] """ step_str = _create_step_id_str(step_ids) - sacct_out, _ = sacct(["--noheader", "-p", "-b", "--jobs", step_str]) + sacct_out, _ = sacct( + ["--noheader", "-p", "-b", "--jobs", step_str], raise_on_err=True + ) + # (status, returncode) stat_tuples = [parse_sacct(sacct_out, step_id) for step_id in step_ids] diff --git a/smartsim/_core/launcher/step/__init__.py b/smartsim/_core/launcher/step/__init__.py index 5a36701a8..98dd1a921 100644 --- a/smartsim/_core/launcher/step/__init__.py +++ b/smartsim/_core/launcher/step/__init__.py @@ -24,7 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from .step import Step from .alpsStep import AprunStep from .cobaltStep import CobaltBatchStep from .localStep import LocalStep @@ -32,3 +31,4 @@ from .mpiStep import MpiexecStep, MpirunStep, OrterunStep from .pbsStep import QsubBatchStep from .slurmStep import SbatchStep, SrunStep +from .step import Step diff --git a/smartsim/_core/launcher/step/alpsStep.py b/smartsim/_core/launcher/step/alpsStep.py index 80e7e7658..d675f703f 100644 --- a/smartsim/_core/launcher/step/alpsStep.py +++ b/smartsim/_core/launcher/step/alpsStep.py @@ -31,8 +31,8 @@ from ....error import AllocationError from ....log import get_logger -from .step import Step from ....settings import AprunSettings, RunSettings, Singularity +from .step import Step, proxyable_launch_cmd logger = get_logger(__name__) @@ -56,9 +56,11 @@ def __init__(self, name: str, cwd: str, run_settings: AprunSettings) -> None: def _get_mpmd(self) -> t.List[RunSettings]: """Temporary convenience function to return a typed list - of attached RunSettings""" + of attached RunSettings + """ return self.run_settings.mpmd + @proxyable_launch_cmd def get_launch_cmd(self) -> t.List[str]: """Get the command to launch this step diff --git a/smartsim/_core/launcher/step/localStep.py b/smartsim/_core/launcher/step/localStep.py index d15a48381..2f10bc79d 100644 --- a/smartsim/_core/launcher/step/localStep.py +++ b/smartsim/_core/launcher/step/localStep.py @@ -28,17 +28,22 @@ import shutil import typing as t -from .step import Step -from ....settings.base import RunSettings from ....settings import Singularity +from ....settings.base import RunSettings +from .step import Step, proxyable_launch_cmd class LocalStep(Step): def __init__(self, name: str, cwd: str, run_settings: RunSettings): super().__init__(name, cwd, run_settings) self.run_settings = run_settings - self.env = self._set_env() + self._env = self._set_env() + + @property + def env(self) -> t.Dict[str, str]: + return self._env + @proxyable_launch_cmd def get_launch_cmd(self) -> 
t.List[str]: cmd = [] diff --git a/smartsim/_core/launcher/step/lsfStep.py b/smartsim/_core/launcher/step/lsfStep.py index ae6c3525b..953ab9c45 100644 --- a/smartsim/_core/launcher/step/lsfStep.py +++ b/smartsim/_core/launcher/step/lsfStep.py @@ -30,9 +30,9 @@ from ....error import AllocationError from ....log import get_logger -from .step import Step from ....settings import BsubBatchSettings, JsrunSettings from ....settings.base import RunSettings +from .step import Step logger = get_logger(__name__) @@ -213,7 +213,8 @@ def _set_alloc(self) -> None: def _get_mpmd(self) -> t.List[RunSettings]: """Temporary convenience function to return a typed list - of attached RunSettings""" + of attached RunSettings + """ if isinstance(self.step_settings, JsrunSettings): return self.step_settings.mpmd return [] diff --git a/smartsim/_core/launcher/step/mpiStep.py b/smartsim/_core/launcher/step/mpiStep.py index 9a0796c0f..7971fb732 100644 --- a/smartsim/_core/launcher/step/mpiStep.py +++ b/smartsim/_core/launcher/step/mpiStep.py @@ -26,14 +26,14 @@ import os import shutil -from shlex import split as sh_split import typing as t +from shlex import split as sh_split from ....error import AllocationError, SmartSimError from ....log import get_logger -from .step import Step -from ....settings import MpirunSettings, MpiexecSettings, OrterunSettings +from ....settings import MpiexecSettings, MpirunSettings, OrterunSettings from ....settings.base import RunSettings +from .step import Step, proxyable_launch_cmd logger = get_logger(__name__) @@ -59,6 +59,7 @@ def __init__(self, name: str, cwd: str, run_settings: RunSettings) -> None: _supported_launchers = ["PBS", "COBALT", "SLURM", "LSB"] + @proxyable_launch_cmd def get_launch_cmd(self) -> t.List[str]: """Get the command to launch this step @@ -118,7 +119,8 @@ def _set_alloc(self) -> None: def _get_mpmd(self) -> t.List[RunSettings]: """Temporary convenience function to return a typed list - of attached RunSettings""" + of attached 
RunSettings + """ if hasattr(self.run_settings, "mpmd") and self.run_settings.mpmd: rs_mpmd: t.List[RunSettings] = self.run_settings.mpmd return rs_mpmd diff --git a/smartsim/_core/launcher/step/pbsStep.py b/smartsim/_core/launcher/step/pbsStep.py index 7a2f47a69..9218894f9 100644 --- a/smartsim/_core/launcher/step/pbsStep.py +++ b/smartsim/_core/launcher/step/pbsStep.py @@ -27,8 +27,8 @@ import typing as t from ....log import get_logger -from .step import Step from ....settings import QsubBatchSettings +from .step import Step logger = get_logger(__name__) diff --git a/smartsim/_core/launcher/step/slurmStep.py b/smartsim/_core/launcher/step/slurmStep.py index 18575e4e9..cb0db483b 100644 --- a/smartsim/_core/launcher/step/slurmStep.py +++ b/smartsim/_core/launcher/step/slurmStep.py @@ -26,13 +26,13 @@ import os import shutil -from shlex import split as sh_split import typing as t +from shlex import split as sh_split from ....error import AllocationError from ....log import get_logger +from ....settings import RunSettings, SbatchSettings, Singularity, SrunSettings from .step import Step -from ....settings import SrunSettings, SbatchSettings, RunSettings, Singularity logger = get_logger(__name__) @@ -189,13 +189,15 @@ def _set_alloc(self) -> None: def _get_mpmd(self) -> t.List[RunSettings]: """Temporary convenience function to return a typed list - of attached RunSettings""" + of attached RunSettings + """ return self.run_settings.mpmd @staticmethod def _get_exe_args_list(run_setting: RunSettings) -> t.List[str]: """Convenience function to encapsulate checking the - runsettings.exe_args type to always return a list""" + runsettings.exe_args type to always return a list + """ exe_args = run_setting.exe_args args: t.List[str] = exe_args if isinstance(exe_args, list) else [exe_args] return args diff --git a/smartsim/_core/launcher/step/step.py b/smartsim/_core/launcher/step/step.py index b3c9a808f..ebbdd074e 100644 --- a/smartsim/_core/launcher/step/step.py +++ 
b/smartsim/_core/launcher/step/step.py @@ -26,16 +26,20 @@ from __future__ import annotations +import functools import os.path as osp +import sys import time import typing as t +from os import makedirs -from smartsim.error.errors import SmartSimError +from smartsim._core.config import CONFIG +from smartsim.error.errors import SmartSimError, UnproxyableStepError from ....log import get_logger -from ...utils.helpers import get_base_36_repr +from ....settings.base import RunSettings, SettingsBase +from ...utils.helpers import encode_cmd, get_base_36_repr from ..colocated import write_colocated_launch_script -from ....settings.base import SettingsBase, RunSettings logger = get_logger(__name__) @@ -47,6 +51,12 @@ def __init__(self, name: str, cwd: str, step_settings: SettingsBase) -> None: self.cwd = cwd self.managed = False self.step_settings = step_settings + self.meta: t.Dict[str, str] = {} + + @property + def env(self) -> t.Optional[t.Dict[str, str]]: + """Overridable, read only property for step to specify its environment""" + return None def get_launch_cmd(self) -> t.List[str]: raise NotImplementedError @@ -67,7 +77,8 @@ def get_step_file( ) -> str: """Get the name for a file/script created by the step class - Used for Batch scripts, mpmd scripts, etc""" + Used for Batch scripts, mpmd scripts, etc. + """ if script_name: script_name = script_name if "." 
in script_name else script_name + ending return osp.join(self.cwd, script_name) @@ -75,7 +86,12 @@ def get_step_file( def get_colocated_launch_script(self) -> str: # prep step for colocated launch if specifed in run settings - script_path = self.get_step_file(script_name=".colocated_launcher.sh") + script_path = self.get_step_file( + script_name=osp.join( + ".smartsim", f"colocated_launcher_{self.entity_name}.sh" + ) + ) + makedirs(osp.dirname(script_path), exist_ok=True) db_settings: t.Dict[str, str] = {} if isinstance(self.step_settings, RunSettings): @@ -101,3 +117,49 @@ def add_to_batch(self, step: Step) -> None: :type step: Step """ raise SmartSimError("add_to_batch not implemented for this step type") + + +_StepT = t.TypeVar("_StepT", bound=Step) + + +def proxyable_launch_cmd( + fn: t.Callable[[_StepT], t.List[str]], / +) -> t.Callable[[_StepT], t.List[str]]: + @functools.wraps(fn) + def _get_launch_cmd(self: _StepT) -> t.List[str]: + original_cmd_list = fn(self) + + if not CONFIG.telemetry_enabled: + return original_cmd_list + + if self.managed: + raise UnproxyableStepError( + f"Attempting to proxy managed step of type {type(self)}" + "through the unmanaged step proxy entry point" + ) + + proxy_module = "smartsim._core.entrypoints.indirect" + etype = self.meta["entity_type"] + status_dir = self.meta["status_dir"] + encoded_cmd = encode_cmd(original_cmd_list) + + # NOTE: this is NOT safe. should either 1) sign cmd and verify OR 2) + # serialize step and let the indirect entrypoint rebuild the + # cmd... for now, test away... 
+ return [ + sys.executable, + "-m", + proxy_module, + "+name", + self.name, + "+command", + encoded_cmd, + "+entity_type", + etype, + "+telemetry_dir", + status_dir, + "+working_dir", + self.cwd, + ] + + return _get_launch_cmd diff --git a/smartsim/_core/launcher/stepInfo.py b/smartsim/_core/launcher/stepInfo.py index 0f46f1ab0..b33dac5ec 100644 --- a/smartsim/_core/launcher/stepInfo.py +++ b/smartsim/_core/launcher/stepInfo.py @@ -24,9 +24,10 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import psutil import typing as t +import psutil + from ...status import ( SMARTSIM_STATUS, STATUS_CANCELLED, diff --git a/smartsim/_core/launcher/stepMapping.py b/smartsim/_core/launcher/stepMapping.py index 9c5e2bec4..665404b1b 100644 --- a/smartsim/_core/launcher/stepMapping.py +++ b/smartsim/_core/launcher/stepMapping.py @@ -32,10 +32,12 @@ class StepMap: - def __init__(self, - step_id: t.Optional[str] = None, - task_id: t.Optional[str] = None, - managed: t.Optional[bool] = None) -> None: + def __init__( + self, + step_id: t.Optional[str] = None, + task_id: t.Optional[str] = None, + managed: t.Optional[bool] = None, + ) -> None: self.step_id = step_id self.task_id = task_id self.managed = managed diff --git a/smartsim/_core/launcher/taskManager.py b/smartsim/_core/launcher/taskManager.py index d244db304..2ad84493f 100644 --- a/smartsim/_core/launcher/taskManager.py +++ b/smartsim/_core/launcher/taskManager.py @@ -26,13 +26,13 @@ from __future__ import annotations -import psutil import time import typing as t - from subprocess import PIPE from threading import RLock, Thread +import psutil + from ...error import LauncherError from ...log import get_logger from ..utils.helpers import check_dev_log_level diff --git a/smartsim/_core/launcher/util/shell.py b/smartsim/_core/launcher/util/shell.py index 9961042a6..1fc243c15 100644 --- a/smartsim/_core/launcher/util/shell.py +++ 
b/smartsim/_core/launcher/util/shell.py @@ -24,11 +24,12 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import psutil import time import typing as t from subprocess import PIPE, TimeoutExpired +import psutil + from ....error import ShellError from ....log import get_logger from ...utils.helpers import check_dev_log_level diff --git a/smartsim/_core/utils/helpers.py b/smartsim/_core/utils/helpers.py index 1bac9cb20..8d7edf722 100644 --- a/smartsim/_core/utils/helpers.py +++ b/smartsim/_core/utils/helpers.py @@ -27,30 +27,59 @@ """ A file of helper functions for SmartSim """ +import base64 import os -import uuid import typing as t +import uuid +from datetime import datetime from functools import lru_cache from pathlib import Path from shutil import which from smartsim._core._install.builder import TRedisAIBackendStr as _TRedisAIBackendStr + +def unpack_db_identifier(db_id: str, token: str) -> t.Tuple[str, str]: + """Unpack the unformatted database identifier + and format for env variable suffix using the token + :param db_id: the unformatted database identifier eg. identifier_1 + :type db_id: str + :param token: character to use to construct the db suffix + :type token: str + :return: db id suffix and formatted db_id e.g. 
("_identifier_1", "identifier_1") + :rtype: (str, str) + """ + + if db_id == "orchestrator": + return "", "" + db_name_suffix = token + db_id + return db_name_suffix, db_id + + +def unpack_colo_db_identifier(db_id: str) -> str: + """Create database identifier suffix for colocated database + :param db_id: the unformatted database identifier + :type db_id: str + :return: db suffix + :rtype: str + """ + return "_" + db_id if db_id else "" + + +def create_short_id_str() -> str: + return str(uuid.uuid4())[:7] + + def create_lockfile_name() -> str: """Generate a unique lock filename using UUID""" - lock_suffix = str(uuid.uuid4())[:7] + lock_suffix = create_short_id_str() return f"smartsim-{lock_suffix}.lock" @lru_cache(maxsize=20, typed=False) def check_dev_log_level() -> bool: - try: - lvl = os.environ["SMARTSIM_LOG_LEVEL"] - if lvl == "developer": - return True - return False - except KeyError: - return False + lvl = os.environ.get("SMARTSIM_LOG_LEVEL", "") + return lvl == "developer" def fmt_dict(value: t.Dict[str, t.Any]) -> str: @@ -245,3 +274,31 @@ def installed_redisai_backends( } return {backend for backend in backends if _installed(base_path, backend)} + + +def get_ts() -> int: + """Return the current timestamp (accurate to seconds) cast to an integer""" + return int(datetime.timestamp(datetime.now())) + + +def encode_cmd(cmd: t.List[str]) -> str: + """Transform a standard command list into an encoded string safe for providing as an + argument to a proxy entrypoint + """ + if not cmd: + raise ValueError("Invalid cmd supplied") + + ascii_cmd = "|".join(cmd).encode("ascii") + encoded_cmd = base64.b64encode(ascii_cmd).decode("ascii") + return encoded_cmd + + +def decode_cmd(encoded_cmd: str) -> t.List[str]: + """Decode an encoded command string to the original command list format""" + if not encoded_cmd.strip(): + raise ValueError("Invalid cmd supplied") + + decoded_cmd = base64.b64decode(encoded_cmd.encode("ascii")) + cleaned_cmd = 
decoded_cmd.decode("ascii").split("|") + + return cleaned_cmd diff --git a/smartsim/_core/utils/network.py b/smartsim/_core/utils/network.py index ed5aed9c9..f18be208e 100644 --- a/smartsim/_core/utils/network.py +++ b/smartsim/_core/utils/network.py @@ -58,7 +58,6 @@ def get_ip_from_interface(interface: str) -> str: # pragma: no cover """ net_if_addrs = psutil.net_if_addrs() if interface not in net_if_addrs: - available = list(net_if_addrs.keys()) raise ValueError( f"{interface} is not a valid network interface. " diff --git a/smartsim/_core/utils/redis.py b/smartsim/_core/utils/redis.py index 9645a367e..6c592d0f3 100644 --- a/smartsim/_core/utils/redis.py +++ b/smartsim/_core/utils/redis.py @@ -25,12 +25,12 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import logging -import redis import time import typing as t - from itertools import product -from redis.cluster import RedisCluster, ClusterNode + +import redis +from redis.cluster import ClusterNode, RedisCluster from redis.exceptions import ClusterDownError, RedisClusterException from smartredis import Client from smartredis.error import RedisReplyError @@ -171,6 +171,7 @@ def set_ml_model(db_model: DBModel, client: Client) -> None: device=device, batch_size=db_model.batch_size, min_batch_size=db_model.min_batch_size, + min_batch_timeout=db_model.min_batch_timeout, tag=db_model.tag, inputs=db_model.inputs, outputs=db_model.outputs, @@ -183,6 +184,7 @@ def set_ml_model(db_model: DBModel, client: Client) -> None: device=device, batch_size=db_model.batch_size, min_batch_size=db_model.min_batch_size, + min_batch_timeout=db_model.min_batch_timeout, tag=db_model.tag, inputs=db_model.inputs, outputs=db_model.outputs, @@ -214,3 +216,30 @@ def set_script(db_script: DBScript, client: Client) -> None: except RedisReplyError as error: # pragma: no cover logger.error("Error while setting model on orchestrator.") raise error + + +def shutdown_db_node(host_ip: str, port: int) -> t.Tuple[int, str, 
str]: # cov-wlm + """Send shutdown signal to DB node. + + Should only be used in the case where cluster deallocation + needs to occur manually. Usually, the SmartSim job manager + will take care of this automatically. + + :param host_ip: IP of host to connect to + :type hosts: str + :param ports: Port to which node is listening + :type ports: int + :return: returncode, output, and error of the process + :rtype: tuple of (int, str, str) + """ + redis_cli = CONFIG.database_cli + cmd = [redis_cli, "-h", host_ip, "-p", str(port), "shutdown"] + returncode, out, err = execute_cmd(cmd, proc_input="yes", shell=False, timeout=10) + + if returncode != 0: + logger.error(out) + logger.error(err) + elif out: + logger.debug(out) + + return returncode, out, err diff --git a/smartsim/_core/utils/serialize.py b/smartsim/_core/utils/serialize.py new file mode 100644 index 000000000..75f9aef66 --- /dev/null +++ b/smartsim/_core/utils/serialize.py @@ -0,0 +1,255 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from __future__ import annotations + +import json +import time +import typing as t +from pathlib import Path + +import smartsim._core._cli.utils as _utils +import smartsim.log +from smartsim._core.config import CONFIG + +if t.TYPE_CHECKING: + from smartsim import Experiment + from smartsim._core.control.manifest import LaunchedManifest as _Manifest + from smartsim.database.orchestrator import Orchestrator + from smartsim.entity import DBNode, Ensemble, Model + from smartsim.entity.dbobject import DBModel, DBScript + from smartsim.settings.base import BatchSettings, RunSettings + + +TStepLaunchMetaData = t.Tuple[ + t.Optional[str], t.Optional[str], t.Optional[bool], str, str, Path +] +TELMON_SUBDIR: t.Final[str] = ".smartsim/telemetry" +MANIFEST_FILENAME: t.Final[str] = "manifest.json" + +_LOGGER = smartsim.log.get_logger(__name__) + + +def save_launch_manifest(manifest: _Manifest[TStepLaunchMetaData]) -> None: + if not CONFIG.telemetry_enabled: + return + + manifest.metadata.run_telemetry_subdirectory.mkdir(parents=True, exist_ok=True) + + new_run = { + "run_id": manifest.metadata.run_id, + "timestamp": int(time.time_ns()), + "model": [ + _dictify_model(model, *telemetry_metadata) + for model, telemetry_metadata in manifest.models + ], + "orchestrator": [ + _dictify_db(db, nodes_info) for db, nodes_info in manifest.databases + ], + "ensemble": [ + _dictify_ensemble(ens, member_info) + for ens, member_info in manifest.ensembles + ], + } + 
try: + with open(manifest.metadata.manifest_file_path, "r", encoding="utf-8") as file: + manifest_dict = json.load(file) + except (FileNotFoundError, json.JSONDecodeError): + manifest_dict = { + "schema info": { + "schema_name": "entity manifest", + "version": "0.0.2", + }, + "experiment": { + "name": manifest.metadata.exp_name, + "path": manifest.metadata.exp_path, + "launcher": manifest.metadata.launcher_name, + }, + "runs": [new_run], + } + else: + manifest_dict["runs"].append(new_run) + finally: + with open(manifest.metadata.manifest_file_path, "w", encoding="utf-8") as file: + json.dump(manifest_dict, file, indent=2) + + +def _dictify_model( + model: Model, + step_id: t.Optional[str], + task_id: t.Optional[str], + managed: t.Optional[bool], + out_file: str, + err_file: str, + telemetry_data_path: Path, +) -> t.Dict[str, t.Any]: + colo_settings = (model.run_settings.colocated_db_settings or {}).copy() + db_scripts = t.cast("t.List[DBScript]", colo_settings.pop("db_scripts", [])) + db_models = t.cast("t.List[DBModel]", colo_settings.pop("db_models", [])) + return { + "name": model.name, + "path": model.path, + "exe_args": model.run_settings.exe_args, + "run_settings": _dictify_run_settings(model.run_settings), + "batch_settings": ( + _dictify_batch_settings(model.batch_settings) + if model.batch_settings + else {} + ), + "params": model.params, + "files": ( + { + "Symlink": model.files.link, + "Configure": model.files.tagged, + "Copy": model.files.copy, + } + if model.files + else { + "Symlink": [], + "Configure": [], + "Copy": [], + } + ), + "colocated_db": ( + { + "settings": colo_settings, + "scripts": [ + { + script.name: { + "backend": "TORCH", + "device": script.device, + } + } + for script in db_scripts + ], + "models": [ + { + model.name: { + "backend": model.backend, + "device": model.device, + } + } + for model in db_models + ], + } + if colo_settings + else {} + ), + "telemetry_metadata": { + "status_dir": str(telemetry_data_path), + "step_id": 
step_id, + "task_id": task_id, + "managed": managed, + }, + "out_file": out_file, + "err_file": err_file, + } + + +def _dictify_ensemble( + ens: Ensemble, + members: t.Sequence[t.Tuple[Model, TStepLaunchMetaData]], +) -> t.Dict[str, t.Any]: + return { + "name": ens.name, + "params": ens.params, + "batch_settings": ( + _dictify_batch_settings(ens.batch_settings) + # FIXME: Typehint here is wrong, ``ens.batch_settings`` can + # also be an empty dict for no discernible reason... + if ens.batch_settings + else {} + ), + "models": [ + _dictify_model(model, *launching_metadata) + for model, launching_metadata in members + ], + } + + +def _dictify_run_settings(run_settings: RunSettings) -> t.Dict[str, t.Any]: + # TODO: remove this downcast + if hasattr(run_settings, "mpmd") and run_settings.mpmd: + _LOGGER.warning( + "SmartSim currently cannot properly serialize all information in " + "MPMD run settings" + ) + return { + "exe": run_settings.exe, + # TODO: We should try to move this back + # "exe_args": run_settings.exe_args, + "run_command": run_settings.run_command, + "run_args": run_settings.run_args, + # TODO: We currently do not have a way to represent MPMD commands! + # Maybe add a ``"mpmd"`` key here that is a + # ``list[TDictifiedRunSettings]``? 
+ } + + +def _dictify_batch_settings(batch_settings: BatchSettings) -> t.Dict[str, t.Any]: + return { + "batch_command": batch_settings.batch_cmd, + "batch_args": batch_settings.batch_args, + } + + +def _dictify_db( + db: Orchestrator, + nodes: t.Sequence[t.Tuple[DBNode, TStepLaunchMetaData]], +) -> t.Dict[str, t.Any]: + db_path = _utils.get_db_path() + if db_path: + db_type, _ = db_path.name.split("-", 1) + else: + db_type = "Unknown" + return { + "name": db.name, + "type": db_type, + "interface": db._interfaces, # pylint: disable=protected-access + "shards": [ + { + **shard.to_dict(), + "conf_file": shard.cluster_conf_file, + "out_file": out_file, + "err_file": err_file, + "telemetry_metadata": { + "status_dir": str(status_dir), + "step_id": step_id, + "task_id": task_id, + "managed": managed, + }, + } + for dbnode, ( + step_id, + task_id, + managed, + out_file, + err_file, + status_dir, + ) in nodes + for shard in dbnode.get_launched_shard_info() + ], + } diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index 6be4f09a9..07a1a1bfd 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -24,24 +24,23 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import itertools -import psutil import sys import typing as t - -from os import getcwd, getenv +from os import environ, getcwd, getenv from shlex import split as sh_split -from smartredis import Client +import psutil +from smartredis import Client, ConfigOptions from smartredis.error import RedisReplyError from .._core.config import CONFIG from .._core.utils import db_is_active -from .._core.utils.helpers import is_valid_cmd +from .._core.utils.helpers import is_valid_cmd, unpack_db_identifier from .._core.utils.network import get_ip_from_host from ..entity import DBNode, EntityList from ..error import SmartSimError, SSConfigError, SSUnsupportedError from ..log import get_logger -from ..settings.base import BatchSettings, RunSettings +from ..servertype import CLUSTERED, STANDALONE from ..settings import ( AprunSettings, BsubBatchSettings, @@ -55,12 +54,12 @@ SbatchSettings, SrunSettings, ) +from ..settings.base import BatchSettings, RunSettings from ..settings.settings import create_batch_settings, create_run_settings from ..wlm import detect_launcher logger = get_logger(__name__) - by_launcher: t.Dict[str, t.List[str]] = { "slurm": ["srun", "mpirun", "mpiexec"], "pbs": ["aprun", "mpirun", "mpiexec"], @@ -141,7 +140,7 @@ def _check_local_constraints(launcher: str, batch: bool) -> None: raise SmartSimError(msg) -class Orchestrator(EntityList): +class Orchestrator(EntityList[DBNode]): """The Orchestrator is an in-memory database that can be launched alongside entities in SmartSim. 
Data can be transferred between entities by using one of the Python, C, C++ or Fortran clients @@ -156,11 +155,16 @@ def __init__( run_command: str = "auto", db_nodes: int = 1, batch: bool = False, - hosts: t.Optional[t.List[str]] = None, + hosts: t.Optional[t.Union[t.List[str], str]] = None, account: t.Optional[str] = None, time: t.Optional[str] = None, alloc: t.Optional[str] = None, single_cmd: bool = False, + *, + threads_per_queue: t.Optional[int] = None, + inter_op_threads: t.Optional[int] = None, + intra_op_threads: t.Optional[int] = None, + db_identifier: str = "orchestrator", **kwargs: t.Any, ) -> None: """Initialize an Orchestrator reference for local launch @@ -181,46 +185,43 @@ def __init__( :param intra_op_threads: threads per CPU operation :type intra_op_threads: int, optional """ - launcher, run_command = _autodetect(launcher, run_command) - - _check_run_command(launcher, run_command) - _check_local_constraints(launcher, batch) - - single_cmd = _get_single_command(run_command, batch, single_cmd) - - self.launcher = launcher - self.run_command = run_command - + self.launcher, self.run_command = _autodetect(launcher, run_command) + _check_run_command(self.launcher, self.run_command) + _check_local_constraints(self.launcher, batch) + single_cmd = _get_single_command(self.run_command, batch, single_cmd) self.ports: t.List[int] = [] - self.path = getcwd() self._hosts: t.List[str] = [] + self._user_hostlist: t.List[str] = [] if isinstance(interface, str): interface = [interface] self._interfaces = interface self._check_network_interface() - self.queue_threads = kwargs.get("threads_per_queue", None) - self.inter_threads = kwargs.get("inter_op_threads", None) - self.intra_threads = kwargs.get("intra_op_threads", None) + self.queue_threads = threads_per_queue + self.inter_threads = inter_op_threads + self.intra_threads = intra_op_threads + + gpus_per_shard: t.Optional[int] = None + cpus_per_shard: t.Optional[int] = None if self.launcher == "lsf": - 
gpus_per_shard = kwargs.pop("gpus_per_shard", 0) - cpus_per_shard = kwargs.pop("cpus_per_shard", 4) - else: - gpus_per_shard = None - cpus_per_shard = None + gpus_per_shard = int(kwargs.pop("gpus_per_shard", 0)) + cpus_per_shard = int(kwargs.pop("cpus_per_shard", 4)) super().__init__( - "orchestrator", - self.path, + name=db_identifier, + path=getcwd(), port=port, interface=interface, db_nodes=db_nodes, batch=batch, - launcher=launcher, - run_command=run_command, + launcher=self.launcher, + run_command=self.run_command, alloc=alloc, single_cmd=single_cmd, gpus_per_shard=gpus_per_shard, cpus_per_shard=cpus_per_shard, + threads_per_queue=threads_per_queue, + inter_op_threads=inter_op_threads, + intra_op_threads=intra_op_threads, **kwargs, ) @@ -234,25 +235,26 @@ def __init__( self._redis_conf # pylint: disable=W0104 CONFIG.database_cli # pylint: disable=W0104 except SSConfigError as e: - msg = "SmartSim not installed with pre-built extensions (Redis)\n" - msg += "Use the `smart` cli tool to install needed extensions\n" - msg += "or set REDIS_PATH and REDIS_CLI_PATH in your environment\n" - msg += "See documentation for more information" - raise SSConfigError(msg) from e + raise SSConfigError( + "SmartSim not installed with pre-built extensions (Redis)\n" + "Use the `smart` cli tool to install needed extensions\n" + "or set REDIS_PATH and REDIS_CLI_PATH in your environment\n" + "See documentation for more information" + ) from e - if launcher != "local": + if self.launcher != "local": self.batch_settings = self._build_batch_settings( db_nodes, alloc or "", batch, account or "", time or "", - launcher=launcher, + launcher=self.launcher, **kwargs, ) if hosts: self.set_hosts(hosts) - elif not hosts and run_command == "mpirun": + elif not hosts and self.run_command == "mpirun": raise SmartSimError( "hosts argument is required when launching Orchestrator with mpirun" ) @@ -260,6 +262,15 @@ def __init__( self._reserved_batch_args: t.Dict[t.Type[BatchSettings], t.List[str]] 
= {} self._fill_reserved() + @property + def db_identifier(self) -> str: + """Return the DB identifier, which is common to a DB and all of its nodes + + :return: DB identifier + :rtype: str + """ + return self.name + @property def num_shards(self) -> int: """Return the number of DB shards contained in the orchestrator. @@ -269,7 +280,19 @@ def num_shards(self) -> int: :returns: num_shards :rtype: int """ - return self.db_nodes + return sum(node.num_shards for node in self.entities) + + @property + def db_nodes(self) -> int: + """Read only property for the number of nodes an ``Orchestrator`` is + launched across. Notice that SmartSim currently assumes that each shard + will be launched on its own node. Therefore this property is currently + an alias to the ``num_shards`` attribute. + + :returns: Number of database nodes + :rtype: int + """ + return self.num_shards @property def hosts(self) -> t.List[str]: @@ -285,10 +308,19 @@ def hosts(self) -> t.List[str]: self._hosts = self._get_db_hosts() return self._hosts + def reset_hosts(self) -> None: + """Clear hosts or reset them to last user choice""" + for node in self.entities: + node.clear_hosts() + self._hosts = [] + # This is only needed on LSF + if self._user_hostlist: + self.set_hosts(self._user_hostlist) + def remove_stale_files(self) -> None: """Can be used to remove database files of a previous launch""" - for db in self.dbnodes: + for db in self.entities: db.remove_stale_dbnode_files() def get_address(self) -> t.List[str]: @@ -306,10 +338,10 @@ def get_address(self) -> t.List[str]: return self._get_address() def _get_address(self) -> t.List[str]: - addresses: t.List[str] = [] - for ip_address, port in itertools.product(self._hosts, self.ports): - addresses.append(":".join((ip_address, str(port)))) - return addresses + return [ + f"{host}:{port}" + for host, port in itertools.product(self._hosts, self.ports) + ] def is_active(self) -> bool: """Check if the database is active @@ -323,20 +355,21 @@ def 
is_active(self) -> bool: return db_is_active(self._hosts, self.ports, self.num_shards) @property - def _rai_module(self) -> str: + def _rai_module(self) -> t.Tuple[str, ...]: """Get the RedisAI module from third-party installations - :return: path to module or "" if not found - :rtype: str + :return: Tuple of args to pass to the orchestrator exe + to load and configure the RedisAI + :rtype: tuple[str] """ module = ["--loadmodule", CONFIG.redisai] if self.queue_threads: - module.append(f"THREADS_PER_QUEUE {self.queue_threads}") + module.extend(("THREADS_PER_QUEUE", str(self.queue_threads))) if self.inter_threads: - module.append(f"INTER_OP_PARALLELISM {self.inter_threads}") + module.extend(("INTER_OP_PARALLELISM", str(self.inter_threads))) if self.intra_threads: - module.append(f"INTRA_OP_PARALLELISM {self.intra_threads}") - return " ".join(module) + module.extend(("INTRA_OP_PARALLELISM", str(self.intra_threads))) + return tuple(module) @property def _redis_exe(self) -> str: @@ -365,7 +398,7 @@ def set_cpus(self, num_cpus: int) -> None: if hasattr(self.batch_settings, "set_cpus_per_task"): self.batch_settings.set_cpus_per_task(num_cpus) - for db in self.dbnodes: + for db in self.entities: db.run_settings.set_cpus_per_task(num_cpus) if db.is_mpmd and hasattr(db.run_settings, "mpmd"): for mpmd in db.run_settings.mpmd: @@ -386,7 +419,7 @@ def set_walltime(self, walltime: str) -> None: if hasattr(self, "batch_settings") and self.batch_settings: self.batch_settings.set_walltime(walltime) - def set_hosts(self, host_list: t.List[str]) -> None: + def set_hosts(self, host_list: t.Union[t.List[str], str]) -> None: """Specify the hosts for the ``Orchestrator`` to launch on :param host_list: list of host (compute node names) @@ -399,16 +432,24 @@ def set_hosts(self, host_list: t.List[str]) -> None: raise TypeError("host_list argument must be a list of strings") if not all(isinstance(host, str) for host in host_list): raise TypeError("host_list argument must be list of strings") 
+ self._user_hostlist = host_list.copy() # TODO check length if self.batch: if hasattr(self, "batch_settings") and self.batch_settings: self.batch_settings.set_hostlist(host_list) if self.launcher == "lsf": - for db in self.dbnodes: + for db in self.entities: db.set_hosts(host_list) + elif ( + self.launcher == "pals" + and isinstance(self.entities[0].run_settings, PalsMpiexecSettings) + and self.entities[0].is_mpmd + ): + # In this case, --hosts is a global option, set it to first run command + self.entities[0].run_settings.set_hostlist(host_list) else: - for host, db in zip(host_list, self.dbnodes): + for host, db in zip(host_list, self.entities): if isinstance(db.run_settings, AprunSettings): if not self.batch: db.run_settings.set_hostlist([host]) @@ -416,8 +457,8 @@ def set_hosts(self, host_list: t.List[str]) -> None: db.run_settings.set_hostlist([host]) if db.is_mpmd and hasattr(db.run_settings, "mpmd"): - for i, mpmd_runsettings in enumerate(db.run_settings.mpmd): - mpmd_runsettings.set_hostlist(host_list[i + 1]) + for i, mpmd_runsettings in enumerate(db.run_settings.mpmd, 1): + mpmd_runsettings.set_hostlist(host_list[i]) def set_batch_arg(self, arg: str, value: t.Optional[str] = None) -> None: """Set a batch argument the orchestrator should launch with @@ -462,26 +503,24 @@ def set_run_arg(self, arg: str, value: t.Optional[str] = None) -> None: "it is a reserved keyword in Orchestrator" ) else: - for db in self.dbnodes: + for db in self.entities: db.run_settings.run_args[arg] = value if db.is_mpmd and hasattr(db.run_settings, "mpmd"): for mpmd in db.run_settings.mpmd: mpmd.run_args[arg] = value def enable_checkpoints(self, frequency: int) -> None: - """Sets the database's save configuration to save the - DB every 'frequency' seconds given that at least one - write operation against the DB occurred in that time. - For example, if `frequency` is 900, then the database - will save to disk after 900 seconds if there is at least - 1 change to the dataset. 
+ """Sets the database's save configuration to save the DB every 'frequency' + seconds given that at least one write operation against the DB occurred in + that time. E.g., if `frequency` is 900, then the database will save to disk + after 900 seconds if there is at least 1 change to the dataset. :param frequency: the given number of seconds before the DB saves :type frequency: int """ - self.set_db_conf("save", str(frequency) + " 1") + self.set_db_conf("save", f"{frequency} 1") - def set_max_memory(self, mem: int) -> None: + def set_max_memory(self, mem: str) -> None: """Sets the max memory configuration. By default there is no memory limit. Setting max memory to zero also results in no memory limit. Once a limit is surpassed, keys will be removed according to the eviction strategy. The @@ -495,7 +534,6 @@ def set_max_memory(self, mem: int) -> None: :param mem: the desired max memory size e.g. 3gb :type mem: str - :raises SmartSimError: If 'mem' is an invalid memory value :raises SmartSimError: If database is not active """ @@ -508,7 +546,6 @@ def set_eviction_strategy(self, strategy: str) -> None: :param strategy: The max memory policy to use e.g. "volatile-lru", "allkeys-lru", etc. 
:type strategy: str - :raises SmartSimError: If 'strategy' is an invalid maxmemory policy :raises SmartSimError: If database is not active """ @@ -553,11 +590,18 @@ def set_db_conf(self, key: str, value: t.Union[int, str]) -> None: addresses = [] for host in self.hosts: for port in self.ports: - address = ":".join([get_ip_from_host(host), str(port)]) - addresses.append(address) + addresses.append(":".join([get_ip_from_host(host), str(port)])) + + db_name, name = unpack_db_identifier(self.db_identifier, "_") + + environ[f"SSDB{db_name}"] = addresses[0] + + db_type = CLUSTERED if self.num_shards > 2 else STANDALONE + environ[f"SR_DB_TYPE{db_name}"] = db_type + + options = ConfigOptions.create_from_environment(name) + client = Client(options) - is_cluster = self.num_shards > 2 - client = Client(address=addresses[0], cluster=is_cluster) try: for address in addresses: client.config_set(key, value, address) @@ -584,10 +628,14 @@ def _build_batch_settings( batch: bool, account: str, time: str, + *, + launcher: t.Optional[str] = None, **kwargs: t.Any, ) -> t.Optional[BatchSettings]: batch_settings = None - launcher = kwargs.pop("launcher") + + if launcher is None: + raise ValueError("Expected param `launcher` of type `str`") # enter this conditional if user has not specified an allocation to run # on or if user specified batch=False (alloc will be found through env) @@ -599,11 +647,16 @@ def _build_batch_settings( return batch_settings def _build_run_settings( - self, exe: str, exe_args: t.List[t.List[str]], **kwargs: t.Any + self, + exe: str, + exe_args: t.List[t.List[str]], + *, + run_args: t.Optional[t.Dict[str, t.Any]] = None, + db_nodes: int = 1, + single_cmd: bool = True, + **kwargs: t.Any, ) -> RunSettings: - run_args = kwargs.pop("run_args", {}) - db_nodes = kwargs.get("db_nodes", 1) - single_cmd = kwargs.get("single_cmd", True) + run_args = {} if run_args is None else run_args mpmd_nodes = single_cmd and db_nodes > 1 if mpmd_nodes: @@ -632,20 +685,28 @@ def 
_build_run_settings( if self.launcher != "local": run_settings.set_tasks_per_node(1) - # Put it back in case it is needed again - kwargs["run_args"] = run_args - return run_settings @staticmethod def _build_run_settings_lsf( - exe: str, exe_args: t.List[t.List[str]], **kwargs: t.Any + exe: str, + exe_args: t.List[t.List[str]], + *, + run_args: t.Optional[t.Dict[str, t.Any]] = None, + cpus_per_shard: t.Optional[int] = None, + gpus_per_shard: t.Optional[int] = None, + **_kwargs: t.Any, # Needed to ensure no API break and do not want to + # introduce that possibility, even if this method is + # protected, without running the test suite. ) -> t.Optional[JsrunSettings]: - run_args = kwargs.pop("run_args", {}) - cpus_per_shard = kwargs.get("cpus_per_shard", None) - gpus_per_shard = kwargs.get("gpus_per_shard", None) + run_args = {} if run_args is None else run_args erf_rs: t.Optional[JsrunSettings] = None + if cpus_per_shard is None: + raise ValueError("Expected an integer number of cpus per shard") + if gpus_per_shard is None: + raise ValueError("Expected an integer number of gpus per shard") + # We always run the DB on cpus 0:cpus_per_shard-1 # and gpus 0:gpus_per_shard-1 for shard_id, args in enumerate(exe_args): @@ -666,9 +727,9 @@ def _build_run_settings_lsf( } if gpus_per_shard > 1: # pragma: no-cover - erf_sets["gpu"] = "{" + f"0-{gpus_per_shard-1}" + "}" + erf_sets["gpu"] = f"{{0-{gpus_per_shard-1}}}" elif gpus_per_shard > 0: - erf_sets["gpu"] = "{" + str(0) + "}" + erf_sets["gpu"] = "{0}" run_settings.set_erf_sets(erf_sets) @@ -678,31 +739,35 @@ def _build_run_settings_lsf( erf_rs.make_mpmd(run_settings) - kwargs["run_args"] = run_args - return erf_rs - def _initialize_entities(self, **kwargs: t.Any) -> None: - self.db_nodes = int(kwargs.get("db_nodes", 1)) - single_cmd = kwargs.get("single_cmd", True) - - if int(self.db_nodes) == 2: + def _initialize_entities( + self, + *, + db_nodes: int = 1, + single_cmd: bool = True, + port: int = 6379, + **kwargs: t.Any, + 
) -> None: + db_nodes = int(db_nodes) + if db_nodes == 2: raise SSUnsupportedError("Orchestrator does not support clusters of size 2") - if self.launcher == "local" and self.db_nodes > 1: + if self.launcher == "local" and db_nodes > 1: raise ValueError( "Local Orchestrator does not support multiple database shards" ) - mpmd_nodes = (single_cmd and self.db_nodes > 1) or self.launcher == "lsf" + mpmd_nodes = (single_cmd and db_nodes > 1) or self.launcher == "lsf" if mpmd_nodes: - self._initialize_entities_mpmd(**kwargs) + self._initialize_entities_mpmd( + db_nodes=db_nodes, single_cmd=single_cmd, port=port, **kwargs + ) else: - port = kwargs.get("port", 6379) - cluster = not bool(self.db_nodes < 3) + cluster = db_nodes >= 3 - for db_id in range(self.db_nodes): + for db_id in range(db_nodes): db_node_name = "_".join((self.name, str(db_id))) # create the exe_args list for launching multiple databases @@ -710,11 +775,10 @@ def _initialize_entities(self, **kwargs: t.Any) -> None: start_script_args = self._get_start_script_args( db_node_name, port, cluster ) - # if only launching 1 db per command, we don't need a # list of exe args lists run_settings = self._build_run_settings( - sys.executable, [start_script_args], **kwargs + sys.executable, [start_script_args], port=port, **kwargs ) node = DBNode( @@ -723,18 +787,20 @@ def _initialize_entities(self, **kwargs: t.Any) -> None: run_settings, [port], [db_node_name + ".out"], + self.db_identifier, ) self.entities.append(node) self.ports = [port] - def _initialize_entities_mpmd(self, **kwargs: t.Any) -> None: - port = kwargs.get("port", 6379) - cluster = not bool(self.db_nodes < 3) - + def _initialize_entities_mpmd( + self, *, db_nodes: int = 1, port: int = 6379, **kwargs: t.Any + ) -> None: + cluster = db_nodes >= 3 + mpmd_node_name = self.name + "_0" exe_args_mpmd: t.List[t.List[str]] = [] - for db_id in range(self.db_nodes): + for db_id in range(db_nodes): db_shard_name = "_".join((self.name, str(db_id))) # create the 
exe_args list for launching multiple databases # per node. also collect port range for dbnode @@ -743,70 +809,51 @@ def _initialize_entities_mpmd(self, **kwargs: t.Any) -> None: ) exe_args = " ".join(start_script_args) exe_args_mpmd.append(sh_split(exe_args)) - run_settings: t.Optional[RunSettings] = None - if self.launcher == "lsf": run_settings = self._build_run_settings_lsf( - sys.executable, exe_args_mpmd, **kwargs + sys.executable, exe_args_mpmd, db_nodes=db_nodes, port=port, **kwargs ) - output_files = [ - "_".join((self.name, str(db_id))) + ".out" - for db_id in range(self.db_nodes) - ] + output_files = [f"{self.name}_{db_id}.out" for db_id in range(db_nodes)] else: run_settings = self._build_run_settings( - sys.executable, exe_args_mpmd, **kwargs + sys.executable, exe_args_mpmd, db_nodes=db_nodes, port=port, **kwargs ) - output_files = [self.name + ".out"] - + output_files = [mpmd_node_name + ".out"] if not run_settings: raise ValueError(f"Could not build run settings for {self.launcher}") - - node = DBNode(self.name, self.path, run_settings, [port], output_files) - node.is_mpmd = True - node.num_shards = self.db_nodes + node = DBNode( + mpmd_node_name, + self.path, + run_settings, + [port], + output_files, + db_identifier=self.db_identifier, + ) self.entities.append(node) - self.ports = [port] - @staticmethod - def _get_cluster_args(name: str, port: int) -> t.List[str]: - """Create the arguments necessary for cluster creation""" - cluster_conf = "".join(("nodes-", name, "-", str(port), ".conf")) - db_args = ["--cluster-enabled yes", "--cluster-config-file", cluster_conf] - return db_args - def _get_start_script_args( self, name: str, port: int, cluster: bool ) -> t.List[str]: - start_script_args = [ + cmd = [ "-m", "smartsim._core.entrypoints.redis", # entrypoint - "+ifname=" + ",".join(self._interfaces), # pass interface to start script - "+command", # command flag for argparser - self._redis_exe, # redis-server - self._redis_conf, # redis.conf file - 
self._rai_module, # redisai.so - "--port", # redis port - str(port), # port number + f"+orc-exe={self._redis_exe}", # redis-server + f"+conf-file={self._redis_conf}", # redis.conf file + "+rai-module", # load redisai.so + *self._rai_module, + f"+name={name}", # name of node + f"+port={port}", # redis port + f"+ifname={','.join(self._interfaces)}", # pass interface to start script ] if cluster: - start_script_args += self._get_cluster_args(name, port) - - return start_script_args - - @property - def dbnodes(self) -> t.List[DBNode]: - """ - Helper property to cast self.entities to DBNode type for type correctness - """ - dbnodes = [node for node in self.entities if isinstance(node, DBNode)] - return dbnodes + cmd.append("+cluster") # is the shard part of a cluster + return cmd def _get_db_hosts(self) -> t.List[str]: hosts = [] - for db in self.dbnodes: + for db in self.entities: if not db.is_mpmd: hosts.append(db.host) else: diff --git a/smartsim/entity/__init__.py b/smartsim/entity/__init__.py index 3cc07f202..4ec28f2d4 100644 --- a/smartsim/entity/__init__.py +++ b/smartsim/entity/__init__.py @@ -28,6 +28,6 @@ from .dbobject import * from .ensemble import Ensemble from .entity import SmartSimEntity +from .entityList import EntityList, EntitySequence from .files import TaggedFilesHierarchy -from .entityList import EntityList from .model import Model diff --git a/smartsim/entity/dbnode.py b/smartsim/entity/dbnode.py index bc04df834..403984d16 100644 --- a/smartsim/entity/dbnode.py +++ b/smartsim/entity/dbnode.py @@ -24,16 +24,20 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import fileinput +import itertools +import json import os import os.path as osp import time import typing as t +from dataclasses import dataclass +from .._core.config import CONFIG from ..error import SmartSimError from ..log import get_logger -from .entity import SmartSimEntity from ..settings.base import RunSettings - +from .entity import SmartSimEntity logger = get_logger(__name__) @@ -54,13 +58,11 @@ def __init__( run_settings: RunSettings, ports: t.List[int], output_files: t.List[str], + db_identifier: str = "", ) -> None: """Initialize a database node within an orchestrator.""" - self.ports = ports - self._host: t.Optional[str] = None super().__init__(name, path, run_settings) - self._mpmd = False - self._num_shards: int = 0 + self.ports = ports self._hosts: t.Optional[t.List[str]] = None if not output_files: @@ -70,20 +72,25 @@ def __init__( ): raise ValueError("output_files must be of type list[str]") self._output_files = output_files + self.db_identifier = db_identifier @property def num_shards(self) -> int: - return self._num_shards + if not hasattr(self.run_settings, "mpmd"): + # return default number of shards if mpmd is not set + return 1 - @num_shards.setter - def num_shards(self, value: int) -> None: - self._num_shards = value + return len(self.run_settings.mpmd) + 1 @property def host(self) -> str: - if not self._host: - self._host = self._parse_db_host() - return self._host + try: + (host,) = self.hosts + except ValueError: + raise ValueError( + f"Multiple hosts detected for this DB Node: {', '.join(self.hosts)}" + ) from None + return host @property def hosts(self) -> t.List[str]: @@ -91,16 +98,16 @@ def hosts(self) -> t.List[str]: self._hosts = self._parse_db_hosts() return self._hosts + def clear_hosts(self) -> None: + self._hosts = None + @property def is_mpmd(self) -> bool: - return self._mpmd - - @is_mpmd.setter - def is_mpmd(self, value: bool) -> None: - self._mpmd = value + if not hasattr(self.run_settings, "mpmd"): + # missing mpmd 
property guarantees this is not an mpmd run + return False - def set_host(self, host: str) -> None: - self._host = str(host) + return bool(self.run_settings.mpmd) def set_hosts(self, hosts: t.List[str]) -> None: self._hosts = [str(host) for host in hosts] @@ -112,42 +119,27 @@ def remove_stale_dbnode_files(self) -> None: """ for port in self.ports: - if not self._mpmd: - conf_file = osp.join(self.path, self._get_cluster_conf_filename(port)) + for conf_file in ( + osp.join(self.path, filename) + for filename in self._get_cluster_conf_filenames(port) + ): if osp.exists(conf_file): os.remove(conf_file) - else: # cov-lsf - conf_files = [ - osp.join(self.path, filename) - for filename in self._get_cluster_conf_filenames(port) - ] - for conf_file in conf_files: - if osp.exists(conf_file): - os.remove(conf_file) for file_ending in [".err", ".out", ".mpmd"]: file_name = osp.join(self.path, self.name + file_ending) if osp.exists(file_name): os.remove(file_name) - if self._mpmd: + + if self.is_mpmd: for file_ending in [".err", ".out"]: - for shard_id in range(self._num_shards): + for shard_id in range(self.num_shards): file_name = osp.join( self.path, self.name + "_" + str(shard_id) + file_ending ) if osp.exists(file_name): os.remove(file_name) - def _get_cluster_conf_filename(self, port: int) -> str: - """Returns the .conf file name for the given port number - - :param port: port number - :type port: int - :return: the dbnode configuration file name - :rtype: str - """ - return "".join(("nodes-", self.name, "-", str(port), ".conf")) - def _get_cluster_conf_filenames(self, port: int) -> t.List[str]: # cov-lsf """Returns the .conf file name for the given port number @@ -158,108 +150,98 @@ def _get_cluster_conf_filenames(self, port: int) -> t.List[str]: # cov-lsf :return: the dbnode configuration file name :rtype: str """ + if self.num_shards == 1: + return [f"nodes-{self.name}-{port}.conf"] return [ - "".join(("nodes-", self.name + f"_{shard_id}", "-", str(port), ".conf")) - 
for shard_id in range(self._num_shards) + f"nodes-{self.name}_{shard_id}-{port}.conf" + for shard_id in range(self.num_shards) ] @staticmethod - def _parse_ips(filepath: str, num_ips: t.Optional[int] = None) -> t.List[str]: - ips = [] - with open(filepath, "r", encoding="utf-8") as dbnode_file: - lines = dbnode_file.readlines() - for line in lines: - content = line.split() - if "IPADDRESS:" in content: - ips.append(content[-1]) - if num_ips and len(ips) == num_ips: - break - - return ips - - def _parse_db_host(self, filepath: t.Optional[str] = None) -> str: - """Parse the database host/IP from the output file - - If no file is passed as argument, then the first - file in self._output_files is used. - - :param filepath: Path to file to parse - :type filepath: str, optional - :raises SmartSimError: if host/ip could not be found - :return: ip address | hostname - :rtype: str + def _parse_launched_shard_info_from_iterable( + stream: t.Iterable[str], num_shards: t.Optional[int] = None + ) -> "t.List[LaunchedShardData]": + lines = (line.strip() for line in stream) + lines = (line for line in lines if line) + tokenized = (line.split(maxsplit=1) for line in lines) + tokenized = (tokens for tokens in tokenized if len(tokens) > 1) + shard_data_jsons = ( + kwjson for first, kwjson in tokenized if "SMARTSIM_ORC_SHARD_INFO" in first + ) + shard_data_kwargs = (json.loads(kwjson) for kwjson in shard_data_jsons) + shard_data: "t.Iterable[LaunchedShardData]" = ( + LaunchedShardData(**kwargs) for kwargs in shard_data_kwargs + ) + if num_shards: + shard_data = itertools.islice(shard_data, num_shards) + return list(shard_data) + + @classmethod + def _parse_launched_shard_info_from_files( + cls, file_paths: t.List[str], num_shards: t.Optional[int] = None + ) -> "t.List[LaunchedShardData]": + with fileinput.FileInput(file_paths) as ifstream: + return cls._parse_launched_shard_info_from_iterable(ifstream, num_shards) + + def get_launched_shard_info(self) -> "t.List[LaunchedShardData]": + 
"""Parse the launched database shard info from the output files + + :raises SmartSimError: if all shard info could not be found + :return: The found launched shard info + :rtype: list[LaunchedShardData] """ - if not filepath: - filepath = osp.join(self.path, self._output_files[0]) - trials = 5 - ip_address = None - - # try a few times to give the database files time to - # populate on busy systems. - while not ip_address and trials > 0: + ips: "t.List[LaunchedShardData]" = [] + trials = CONFIG.database_file_parse_trials + interval = CONFIG.database_file_parse_interval + output_files = [osp.join(self.path, file) for file in self._output_files] + + while len(ips) < self.num_shards and trials > 0: try: - if ip_addresses := self._parse_ips(filepath, 1): - ip_address = ip_addresses[0] - # suppress error + ips = self._parse_launched_shard_info_from_files( + output_files, self.num_shards + ) except FileNotFoundError: - pass - - logger.debug("Waiting for Redis output files to populate...") - if not ip_address: - time.sleep(1) + ... + if len(ips) < self.num_shards: + logger.debug("Waiting for output files to populate...") + time.sleep(interval) trials -= 1 - if not ip_address: - logger.error(f"IP address lookup strategy failed for file {filepath}.") - raise SmartSimError("Failed to obtain database hostname") - - return ip_address + if len(ips) < self.num_shards: + msg = ( + f"Failed to parse the launched DB shard information from file(s) " + f"{', '.join(output_files)}. Found the information for " + f"{len(ips)} out of {self.num_shards} DB shards." + ) + logger.error(msg) + raise SmartSimError(msg) + return ips def _parse_db_hosts(self) -> t.List[str]: """Parse the database hosts/IPs from the output files - this uses the RedisIP module that is built as a dependency The IP address is preferred, but if hostname is only present then a lookup to /etc/hosts is done through the socket library. - This function must be called only if ``_mpmd==True``. 
:raises SmartSimError: if host/ip could not be found :return: ip addresses | hostnames :rtype: list[str] """ - ips: t.List[str] = [] - - # Find out if all shards' output streams are piped to separate files - if len(self._output_files) > 1: - for output_file in self._output_files: - filepath = osp.join(self.path, output_file) - _ = self._parse_db_host(filepath) - else: - filepath = osp.join(self.path, self._output_files[0]) - trials = 10 - ips = [] - while len(ips) < self._num_shards and trials > 0: - ips = [] - try: - ip_address = self._parse_ips(filepath, self._num_shards) - ips.extend(ip_address) - - # suppress error - except FileNotFoundError: - pass - - if len(ips) < self._num_shards: - logger.debug("Waiting for RedisIP files to populate...") - # Larger sleep time, as this seems to be needed for - # multihost setups - time.sleep(2) - trials -= 1 - - if len(ips) < self._num_shards: - msg = f"IP address lookup strategy failed for file {filepath}. " - msg += f"Found {len(ips)} out of {self._num_shards} IPs." - logger.error(msg) - raise SmartSimError("Failed to obtain database hostname") - - ips = list(dict.fromkeys(ips)) - return ips + return list({shard.hostname for shard in self.get_launched_shard_info()}) + + +@dataclass(frozen=True) +class LaunchedShardData: + """Data class to write and parse data about a launched database shard""" + + name: str + hostname: str + port: int + cluster: bool + + @property + def cluster_conf_file(self) -> t.Optional[str]: + return f"nodes-{self.name}-{self.port}.conf" if self.cluster else None + + def to_dict(self) -> t.Dict[str, t.Any]: + return dict(self.__dict__) diff --git a/smartsim/entity/dbobject.py b/smartsim/entity/dbobject.py index 7fe2051ea..bebedb12c 100644 --- a/smartsim/entity/dbobject.py +++ b/smartsim/entity/dbobject.py @@ -25,12 +25,11 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import typing as t - from pathlib import Path + from .._core.utils import init_default from ..error import SSUnsupportedError - __all__ = ["DBObject", "DBModel", "DBScript"] @@ -46,17 +45,19 @@ def __init__( file_path: t.Optional[str], device: t.Literal["CPU", "GPU"], devices_per_node: int, + first_device: int, ) -> None: self.name = name self.func = func - self.file: t.Optional[ - Path - ] = None # Need to have this explicitly to check on it + self.file: t.Optional[Path] = ( + None # Need to have this explicitly to check on it + ) if file_path: self.file = self._check_filepath(file_path) self.device = self._check_device(device) self.devices_per_node = devices_per_node - self._check_devices(device, devices_per_node) + self.first_device = first_device + self._check_devices(device, devices_per_node, first_device) @property def devices(self) -> t.List[str]: @@ -118,16 +119,30 @@ def _enumerate_devices(self) -> t.List[str]: if self.device == "GPU" and self.devices_per_node > 1: return [ - f"{self.device}:{str(device_num)}" - for device_num in range(self.devices_per_node) + f"{self.device}:{device_num}" + for device_num in range( + self.first_device, self.first_device + self.devices_per_node + ) ] return [self.device] @staticmethod def _check_devices( - device: t.Literal["CPU", "GPU"], devices_per_node: int + device: t.Literal["CPU", "GPU"], + devices_per_node: int, + first_device: int, ) -> None: + if device == "CPU" and devices_per_node > 1: + raise SSUnsupportedError( + "Cannot set devices_per_node>1 if CPU is specified under devices" + ) + + if device == "CPU" and first_device > 0: + raise SSUnsupportedError( + "Cannot set first_device>0 if CPU is specified under devices" + ) + if devices_per_node == 1: return @@ -136,10 +151,6 @@ def _check_devices( msg += f"the device was set to {device} and \ devices_per_node=={devices_per_node}" raise ValueError(msg) - if device == "CPU": - raise SSUnsupportedError( - "Cannot set devices_per_node>1 if CPU is specified under 
devices" - ) class DBScript(DBObject): @@ -150,6 +161,7 @@ def __init__( script_path: t.Optional[str] = None, device: t.Literal["CPU", "GPU"] = "CPU", devices_per_node: int = 1, + first_device: int = 0, ): """TorchScript code represenation @@ -157,7 +169,9 @@ def __init__( present, a number can be passed for specification e.g. "GPU:1". Setting ``devices_per_node=N``, with N greater than one will result - in the model being stored on the first N devices of type ``device``. + in the script being stored on the first N devices of type ``device``; + additionally setting ``first_device=M`` will instead result in the + script being stored on devices M through M + N -1. One of either script (in memory representation) or script_path (file) must be provided @@ -172,8 +186,12 @@ def __init__( :type device: str, optional :param devices_per_node: number of devices to store the script on :type devices_per_node: int + :param first_device: first devices to store the script on + :type first_device: int """ - super().__init__(name, script, script_path, device, devices_per_node) + super().__init__( + name, script, script_path, device, devices_per_node, first_device + ) if not script and not script_path: raise ValueError("Either script or script_path must be provided") @@ -191,6 +209,8 @@ def __str__(self) -> str: "s per node\n" if self.devices_per_node > 1 else " per node\n" ) desc_str += "Devices: " + str(self.devices_per_node) + " " + devices_str + if self.first_device > 0: + desc_str += "First device: " + str(self.first_device) + "\n" return desc_str @@ -203,6 +223,7 @@ def __init__( model_file: t.Optional[str] = None, device: t.Literal["CPU", "GPU"] = "CPU", devices_per_node: int = 1, + first_device: int = 0, batch_size: int = 0, min_batch_size: int = 0, min_batch_timeout: int = 0, @@ -227,6 +248,8 @@ def __init__( :type device: str, optional :param devices_per_node: number of devices to store the model on :type devices_per_node: int + :param first_device: The first device to 
store the model on + :type first_device: int :param batch_size: batch size for execution, defaults to 0 :type batch_size: int, optional :param min_batch_size: minimum batch size for model execution, defaults to 0 @@ -240,7 +263,9 @@ def __init__( :param outputs: model outupts (TF only), defaults to None :type outputs: list[str], optional """ - super().__init__(name, model, model_file, device, devices_per_node) + super().__init__( + name, model, model_file, device, devices_per_node, first_device + ) self.backend = self._check_backend(backend) if not model and not model_file: raise ValueError("Either model or model_file must be provided") @@ -264,6 +289,8 @@ def __str__(self) -> str: "s per node\n" if self.devices_per_node > 1 else " per node\n" ) desc_str += "Devices: " + str(self.devices_per_node) + " " + devices_str + if self.first_device > 0: + desc_str += "First_device: " + str(self.first_device) + "\n" desc_str += "Backend: " + str(self.backend) + "\n" if self.batch_size: desc_str += "Batch size: " + str(self.batch_size) + "\n" diff --git a/smartsim/entity/ensemble.py b/smartsim/entity/ensemble.py index 366be2053..28ada31de 100644 --- a/smartsim/entity/ensemble.py +++ b/smartsim/entity/ensemble.py @@ -25,7 +25,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import typing as t - from copy import deepcopy from os import getcwd @@ -41,9 +40,9 @@ from ..log import get_logger from ..settings.base import BatchSettings, RunSettings from .dbobject import DBModel, DBScript +from .entity import SmartSimEntity from .entityList import EntityList from .model import Model -from .entity import SmartSimEntity from .strategies import create_all_permutations, random_permutations, step_values logger = get_logger(__name__) @@ -53,7 +52,7 @@ ] -class Ensemble(EntityList): +class Ensemble(EntityList[Model]): """``Ensemble`` is a group of ``Model`` instances that can be treated as a reference to a single instance. 
""" @@ -362,8 +361,10 @@ def add_ml_model( model_path: t.Optional[str] = None, device: t.Literal["CPU", "GPU"] = "CPU", devices_per_node: int = 1, + first_device: int = 0, batch_size: int = 0, min_batch_size: int = 0, + min_batch_timeout: int = 0, tag: str = "", inputs: t.Optional[t.List[str]] = None, outputs: t.Optional[t.List[str]] = None, @@ -387,10 +388,18 @@ def add_ml_model( :type backend: str :param device: name of device for execution, defaults to "CPU" :type device: str, optional + :param devices_per_node: number of GPUs per node in multiGPU nodes, + defaults to 1 + :type devices_per_node: int, optional + :param first_device: first device in multi-GPU nodes to use for execution, + defaults to 0; ignored if devices_per_node is 1 + :type first_device: int, optional :param batch_size: batch size for execution, defaults to 0 :type batch_size: int, optional :param min_batch_size: minimum batch size for model execution, defaults to 0 :type min_batch_size: int, optional + :param min_batch_timeout: time to wait for minimum batch size, defaults to 0 + :type min_batch_timeout: int, optional :param tag: additional tag for model information, defaults to "" :type tag: str, optional :param inputs: model inputs (TF only), defaults to None @@ -405,8 +414,10 @@ def add_ml_model( model_file=model_path, device=device, devices_per_node=devices_per_node, + first_device=first_device, batch_size=batch_size, min_batch_size=min_batch_size, + min_batch_timeout=min_batch_timeout, tag=tag, inputs=inputs, outputs=outputs, @@ -422,6 +433,7 @@ def add_script( script_path: t.Optional[str] = None, device: t.Literal["CPU", "GPU"] = "CPU", devices_per_node: int = 1, + first_device: int = 0, ) -> None: """TorchScript to launch with every entity belonging to this ensemble @@ -448,6 +460,8 @@ def add_script( :type device: str, optional :param devices_per_node: number of devices on each host :type devices_per_node: int + :param first_device: first device to use on each host + :type 
first_device: int """ db_script = DBScript( name=name, @@ -455,6 +469,7 @@ def add_script( script_path=script_path, device=device, devices_per_node=devices_per_node, + first_device=first_device, ) self._db_scripts.append(db_script) for entity in self.models: @@ -466,6 +481,7 @@ def add_function( function: t.Optional[str] = None, device: t.Literal["CPU", "GPU"] = "CPU", devices_per_node: int = 1, + first_device: int = 0, ) -> None: """TorchScript function to launch with every entity belonging to this ensemble @@ -479,7 +495,9 @@ def add_function( present, a number can be passed for specification e.g. "GPU:1". Setting ``devices_per_node=N``, with N greater than one will result - in the model being stored in the first N devices of type ``device``. + in the script being stored in the first N devices of type ``device``; + alternatively, setting ``first_device=M`` will result in the script + being stored on nodes M through M + N - 1. :param name: key to store function under :type name: str @@ -489,9 +507,15 @@ def add_function( :type device: str, optional :param devices_per_node: number of devices on each host :type devices_per_node: int + :param first_device: first device to use on each host + :type first_device: int """ db_script = DBScript( - name=name, script=function, device=device, devices_per_node=devices_per_node + name=name, + script=function, + device=device, + devices_per_node=devices_per_node, + first_device=first_device, ) self._db_scripts.append(db_script) for entity in self.models: diff --git a/smartsim/entity/entity.py b/smartsim/entity/entity.py index fa46215cf..0d126c907 100644 --- a/smartsim/entity/entity.py +++ b/smartsim/entity/entity.py @@ -23,12 +23,17 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import smartsim.settings.base + +import typing as t + +if t.TYPE_CHECKING: + # pylint: disable-next=unused-import + import smartsim.settings.base class SmartSimEntity: def __init__( - self, name: str, path: str, run_settings: smartsim.settings.base.RunSettings + self, name: str, path: str, run_settings: "smartsim.settings.base.RunSettings" ) -> None: """Initialize a SmartSim entity. diff --git a/smartsim/entity/entityList.py b/smartsim/entity/entityList.py index 1d3a991d4..4eaf3faa0 100644 --- a/smartsim/entity/entityList.py +++ b/smartsim/entity/entityList.py @@ -26,20 +26,53 @@ import typing as t +from .entity import SmartSimEntity + if t.TYPE_CHECKING: - # pylint: disable=unused-import + # pylint: disable-next=unused-import import smartsim +_T = t.TypeVar("_T", bound=SmartSimEntity) +# Old style pyint from TF 2.6.x does not know about pep484 style ``TypeVar`` names +# pylint: disable-next=invalid-name +_T_co = t.TypeVar("_T_co", bound=SmartSimEntity, covariant=True) + -class EntityList: +class EntitySequence(t.Generic[_T_co]): """Abstract class for containers for SmartSimEntities""" def __init__(self, name: str, path: str, **kwargs: t.Any) -> None: self.name: str = name self.path: str = path - self.entities: t.List["smartsim.entity.SmartSimEntity"] = [] - self._db_models: t.List["smartsim.entity.DBModel"] = [] - self._db_scripts: t.List["smartsim.entity.DBScript"] = [] + + # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + # WARNING: This class cannot be made truly covariant until the + # following properties are made read-only. It is currently + # designed for in-house type checking only!! + # + # Despite the fact that these properties are type hinted as + # ``Sequence``s, the underlying types must remain ``list``s as that is + # what subclasses are expecting when implementing their + # ``_initialize_entities`` methods. 
+ # + # I'm leaving it "as is" for now as to not introduce a potential API + # break in case any users subclassed the invariant version of this + # class (``EntityList``), but a "proper" solution would be to turn + # ``EntitySequence``/``EntityList`` into proper ``abc.ABC``s and have + # the properties we expect to be initialized represented as abstract + # properties. An additional benefit of this solution is would be that + # users could actually initialize their entities in the ``__init__`` + # method, and it would remove the need for the cumbersome and + # un-type-hint-able ``_initialize_entities`` method by returning all + # object construction into the class' constructor. + # --------------------------------------------------------------------- + # + self.entities: t.Sequence[_T_co] = [] + self._db_models: t.Sequence["smartsim.entity.DBModel"] = [] + self._db_scripts: t.Sequence["smartsim.entity.DBScript"] = [] + # + # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + self._initialize_entities(**kwargs) def _initialize_entities(self, **kwargs: t.Any) -> None: @@ -79,15 +112,35 @@ def set_path(self, new_path: str) -> None: for entity in self.entities: entity.path = new_path - def __getitem__(self, name: str) -> t.Optional["smartsim.entity.SmartSimEntity"]: + def __getitem__(self, name: str) -> t.Optional[_T_co]: for entity in self.entities: if entity.name == name: return entity return None - def __iter__(self) -> t.Iterator["smartsim.entity.SmartSimEntity"]: + def __iter__(self) -> t.Iterator[_T_co]: for entity in self.entities: yield entity def __len__(self) -> int: return len(self.entities) + + +class EntityList(EntitySequence[_T]): + """An invariant subclass of an ``EntitySequence`` with mutable containers""" + + def __init__(self, name: str, path: str, **kwargs: t.Any) -> None: + super().__init__(name, path, **kwargs) + # Change container types to be invariant ``list``s + self.entities: t.List[_T] = list(self.entities) + 
self._db_models: t.List["smartsim.entity.DBModel"] = list(self._db_models) + self._db_scripts: t.List["smartsim.entity.DBScript"] = list(self._db_scripts) + + def _initialize_entities(self, **kwargs: t.Any) -> None: + """Initialize the SmartSimEntity objects in the container""" + # Need to identically re-define this "abstract method" or pylint + # complains that we are trying to define a concrete implementation of + # an abstract class despite the fact that we want this class to also be + # abstract. All the more reason to turn both of these classes into + # ``abc.ABC``s in my opinion. + raise NotImplementedError diff --git a/smartsim/entity/files.py b/smartsim/entity/files.py index 300452073..3aae9402b 100644 --- a/smartsim/entity/files.py +++ b/smartsim/entity/files.py @@ -25,8 +25,8 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import os import typing as t - from os import path + from tabulate import tabulate diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index a86be546b..6b97cbf2e 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -27,21 +27,23 @@ from __future__ import annotations import collections.abc +import re import sys import typing as t import warnings +from os import path as osp from .._core.utils.helpers import cat_arg_and_value, init_default from ..error import EntityExistsError, SSUnsupportedError +from ..log import get_logger +from ..settings.base import BatchSettings, RunSettings from .dbobject import DBModel, DBScript from .entity import SmartSimEntity from .files import EntityFiles -from ..settings.base import BatchSettings, RunSettings -from ..log import get_logger - logger = get_logger(__name__) + class Model(SmartSimEntity): def __init__( self, @@ -163,6 +165,19 @@ def attach_generator_files( to_copy = init_default([], to_copy, (list, str)) to_symlink = init_default([], to_symlink, (list, str)) to_configure = init_default([], to_configure, (list, str)) + + # Check that no file 
collides with the parameter file written + # by Generator. We check the basename, even though it is more + # restrictive than what we need (but it avoids relative path issues) + for strategy in [to_copy, to_symlink, to_configure]: + if strategy is not None and any( + osp.basename(filename) == "smartsim_params.txt" for filename in strategy + ): + raise ValueError( + "`smartsim_params.txt` is a file automatically " + + "generated by SmartSim and cannot be ovewritten." + ) + self.files = EntityFiles(to_configure, to_copy, to_symlink) @property @@ -177,8 +192,7 @@ def attached_files_table(self) -> str: return str(self.files) def print_attached_files(self) -> None: - """Print a table of the attached files on std out - """ + """Print a table of the attached files on std out""" print(self.attached_files_table) def colocate_db(self, *args: t.Any, **kwargs: t.Any) -> None: @@ -187,7 +201,8 @@ def colocate_db(self, *args: t.Any, **kwargs: t.Any) -> None: ( "`colocate_db` has been deprecated and will be removed in a \n" "future release. Please use `colocate_db_tcp` or `colocate_db_uds`." - ), FutureWarning + ), + FutureWarning, ) self.colocate_db_tcp(*args, **kwargs) @@ -198,6 +213,7 @@ def colocate_db_uds( db_cpus: int = 1, custom_pinning: t.Optional[t.Iterable[t.Union[int, t.Iterable[int]]]] = None, debug: bool = False, + db_identifier: str = "", **kwargs: t.Any, ) -> None: """Colocate an Orchestrator instance with this Model over UDS. @@ -237,6 +253,12 @@ def colocate_db_uds( :type kwargs: dict, optional """ + if not re.match(r"^[a-zA-Z0-9.:\,_\-/]*$", unix_socket): + raise ValueError( + f"Invalid name for unix socket: {unix_socket}. Must only " + "contain alphanumeric characters or . 
: _ - /" + ) + uds_options = { "unix_socket": unix_socket, "socket_permissions": socket_permissions, @@ -247,6 +269,7 @@ def colocate_db_uds( "cpus": db_cpus, "custom_pinning": custom_pinning, "debug": debug, + "db_identifier": db_identifier, } self._set_colocated_db_settings(uds_options, common_options, **kwargs) @@ -257,6 +280,7 @@ def colocate_db_tcp( db_cpus: int = 1, custom_pinning: t.Optional[t.Iterable[t.Union[int, t.Iterable[int]]]] = None, debug: bool = False, + db_identifier: str = "", **kwargs: t.Any, ) -> None: """Colocate an Orchestrator instance with this Model over TCP/IP. @@ -302,6 +326,7 @@ def colocate_db_tcp( "cpus": db_cpus, "custom_pinning": custom_pinning, "debug": debug, + "db_identifier": db_identifier, } self._set_colocated_db_settings(tcp_options, common_options, **kwargs) @@ -333,8 +358,7 @@ def _set_colocated_db_settings( # TODO list which db settings can be extras common_options["custom_pinning"] = self._create_pinning_string( - common_options["custom_pinning"], - common_options["cpus"] + common_options["custom_pinning"], common_options["cpus"] ) colo_db_config = {} @@ -358,13 +382,13 @@ def _set_colocated_db_settings( @staticmethod def _create_pinning_string( - pin_ids: t.Optional[t.Iterable[t.Union[int, t.Iterable[int]]]], - cpus: int - ) -> t.Optional[str]: + pin_ids: t.Optional[t.Iterable[t.Union[int, t.Iterable[int]]]], cpus: int + ) -> t.Optional[str]: """Create a comma-separated string CPU ids. By default, None returns 0,1,...,cpus-1; an empty iterable will disable pinning altogether, and an iterable constructs a comma separate string (e.g. 0,2,5) """ + def _stringify_id(_id: int) -> str: """Return the cPU id as a string if an int, otherwise raise a ValueError""" if isinstance(_id, int): @@ -389,14 +413,14 @@ def _stringify_id(_id: int) -> str: warnings.warn( "CPU pinning is not supported on MacOSX. 
Ignoring pinning " "specification.", - RuntimeWarning + RuntimeWarning, ) return None raise TypeError(_invalid_input_message) # Flatten the iterable into a list and check to make sure that the resulting # elements are all ints if pin_ids is None: - return ','.join(_stringify_id(i) for i in range(cpus)) + return ",".join(_stringify_id(i) for i in range(cpus)) if not pin_ids: return None if isinstance(pin_ids, collections.abc.Iterable): @@ -406,7 +430,7 @@ def _stringify_id(_id: int) -> str: pin_list.extend([_stringify_id(j) for j in pin_id]) else: pin_list.append(_stringify_id(pin_id)) - return ','.join(sorted(set(pin_list))) + return ",".join(sorted(set(pin_list))) raise TypeError(_invalid_input_message) def params_to_args(self) -> None: @@ -433,10 +457,12 @@ def add_ml_model( backend: str, model: t.Optional[str] = None, model_path: t.Optional[str] = None, - device: t.Literal["CPU","GPU"] = "CPU", + device: t.Literal["CPU", "GPU"] = "CPU", devices_per_node: int = 1, + first_device: int = 0, batch_size: int = 0, min_batch_size: int = 0, + min_batch_timeout: int = 0, tag: str = "", inputs: t.Optional[t.List[str]] = None, outputs: t.Optional[t.List[str]] = None, @@ -462,12 +488,18 @@ def add_ml_model( :type device: str, optional :param devices_per_node: The number of GPU devices available on the host. This parameter only applies to GPU devices and will be ignored if device - is specified as GPU. + is specified as CPU. :type devices_per_node: int + :param first_device: The first GPU device to use on the host. + This parameter only applies to GPU devices and will be ignored if device + is specified as CPU. 
+ :type first_device: int :param batch_size: batch size for execution, defaults to 0 :type batch_size: int, optional :param min_batch_size: minimum batch size for model execution, defaults to 0 :type min_batch_size: int, optional + :param min_batch_timeout: time to wait for minimum batch size, defaults to 0 + :type min_batch_timeout: int, optional :param tag: additional tag for model information, defaults to "" :type tag: str, optional :param inputs: model inputs (TF only), defaults to None @@ -482,8 +514,10 @@ def add_ml_model( model_file=model_path, device=device, devices_per_node=devices_per_node, + first_device=first_device, batch_size=batch_size, min_batch_size=min_batch_size, + min_batch_timeout=min_batch_timeout, tag=tag, inputs=inputs, outputs=outputs, @@ -495,8 +529,9 @@ def add_script( name: str, script: t.Optional[str] = None, script_path: t.Optional[str] = None, - device: t.Literal["CPU","GPU"] = "CPU", + device: t.Literal["CPU", "GPU"] = "CPU", devices_per_node: int = 1, + first_device: int = 0, ) -> None: """TorchScript to launch with this Model instance @@ -508,7 +543,9 @@ def add_script( present, a number can be passed for specification e.g. "GPU:1". Setting ``devices_per_node=N``, with N greater than one will result - in the model being stored in the first N devices of type ``device``. + in the script being stored in the first N devices of type ``device``; + alternatively, setting ``first_device=M`` will result in the script + being stored on nodes M through M + N - 1. One of either script (in memory string representation) or script_path (file) must be provided @@ -523,8 +560,12 @@ def add_script( :type device: str, optional :param devices_per_node: The number of GPU devices available on the host. This parameter only applies to GPU devices and will be ignored if device - is specified as GPU. + is specified as CPU. :type devices_per_node: int + :param first_device: The first GPU device to use on the host. 
+ This parameter only applies to GPU devices and will be ignored if device + is specified as CPU. + :type first_device: int """ db_script = DBScript( name=name, @@ -532,6 +573,7 @@ def add_script( script_path=script_path, device=device, devices_per_node=devices_per_node, + first_device=first_device, ) self.add_script_object(db_script) @@ -539,8 +581,9 @@ def add_function( self, name: str, function: t.Optional[str] = None, - device: t.Literal["CPU","GPU"] = "CPU", + device: t.Literal["CPU", "GPU"] = "CPU", devices_per_node: int = 1, + first_device: int = 0, ) -> None: """TorchScript function to launch with this Model instance @@ -564,11 +607,19 @@ def add_function( :type device: str, optional :param devices_per_node: The number of GPU devices available on the host. This parameter only applies to GPU devices and will be ignored if device - is specified as GPU. + is specified as CPU. :type devices_per_node: int + :param first_device: The first GPU device to use on the host. + This parameter only applies to GPU devices and will be ignored if device + is specified as CPU. 
+ :type first_device: int """ db_script = DBScript( - name=name, script=function, device=device, devices_per_node=devices_per_node + name=name, + script=function, + device=device, + devices_per_node=devices_per_node, + first_device=first_device, ) self.add_script_object(db_script) diff --git a/smartsim/entity/strategies.py b/smartsim/entity/strategies.py index 7aec57765..e9db30c8f 100644 --- a/smartsim/entity/strategies.py +++ b/smartsim/entity/strategies.py @@ -27,7 +27,6 @@ # Generation Strategies import random import typing as t - from itertools import product diff --git a/smartsim/error/__init__.py b/smartsim/error/__init__.py index 383f82bb4..a04f5d91e 100644 --- a/smartsim/error/__init__.py +++ b/smartsim/error/__init__.py @@ -32,9 +32,9 @@ ShellError, SmartSimError, SSConfigError, + SSDBIDConflictError, SSInternalError, + SSReservedKeywordError, SSUnsupportedError, UserStrategyError, - SSReservedKeywordError, - ) diff --git a/smartsim/error/errors.py b/smartsim/error/errors.py index 95bc92420..d71ae3f71 100644 --- a/smartsim/error/errors.py +++ b/smartsim/error/errors.py @@ -39,12 +39,14 @@ class SSUnsupportedError(Exception): class EntityExistsError(SmartSimError): """Raised when a user tries to create an entity or files/directories for - an entity and either the entity/files/directories already exist""" + an entity and either the entity/files/directories already exist + """ class UserStrategyError(SmartSimError): """Raised when there is an error with model creation inside an ensemble - that is from a user provided permutation strategy""" + that is from a user provided permutation strategy + """ def __init__(self, perm_strat: str) -> None: message = self.create_message(perm_strat) @@ -79,13 +81,17 @@ class SSReservedKeywordError(SmartSimError): """Raised when a Reserved Keyword is used incorrectly""" +class SSDBIDConflictError(SmartSimError): + """Raised in the event that a database identifier + is not unique when multiple databases are created + """ + + # 
Internal Exceptions class SSInternalError(Exception): - """ - SSInternalError is raised when an internal error is encountered. - """ + """SSInternalError is raised when an internal error is encountered""" class SSConfigError(SSInternalError): @@ -102,7 +108,8 @@ class AllocationError(LauncherError): class ShellError(LauncherError): """Raised when error arises from function within launcher.shell - Closely related to error from subprocess(Popen) commands""" + Closely related to error from subprocess(Popen) commands + """ def __init__( self, @@ -126,3 +133,19 @@ def create_message( if details: msg += f"\nError from shell: {details}" return msg + + +class TelemetryError(SSInternalError): + """Raised when SmartSim runs into trouble establishing or communicating + telemetry information + """ + + +class UnproxyableStepError(TelemetryError): + """Raised when a user attempts to proxy a managed ``Step`` through the + unmanaged step proxy entry point + """ + + +class SmartSimCLIActionCancelled(SmartSimError): + """Raised when a `smart` CLI command is terminated""" diff --git a/smartsim/experiment.py b/smartsim/experiment.py index 639d0e7c9..9de33419a 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import os import os.path as osp import typing as t from os import getcwd @@ -36,7 +37,7 @@ from .entity import Ensemble, Model, SmartSimEntity from .error import SmartSimError from .log import get_logger -from .settings import settings, base, Container +from .settings import Container, base, settings from .wlm import detect_launcher logger = get_logger(__name__) @@ -62,7 +63,10 @@ class Experiment: """ def __init__( - self, name: str, exp_path: t.Optional[str] = None, launcher: str = "local" + self, + name: str, + exp_path: t.Optional[str] = None, + launcher: str = "local", ): """Initialize an Experiment instance @@ -126,6 +130,7 @@ def __init__( self._control = Controller(launcher=launcher) self._launcher = launcher.lower() + self.db_identifiers: t.Set[str] = set() def start( self, @@ -184,11 +189,14 @@ def start( :type kill_on_interrupt: bool, optional """ + start_manifest = Manifest(*args) try: if summary: self._launch_summary(start_manifest) self._control.start( + exp_name=self.name, + exp_path=self.exp_path, manifest=start_manifest, block=block, kill_on_interrupt=kill_on_interrupt, @@ -220,18 +228,25 @@ def stop(self, *args: t.Any) -> None: :raises TypeError: if wrong type :raises SmartSimError: if stop request fails """ + stop_manifest = Manifest(*args) try: - stop_manifest = Manifest(*args) for entity in stop_manifest.models: self._control.stop_entity(entity) - for entity_list in stop_manifest.all_entity_lists: + for entity_list in stop_manifest.ensembles: self._control.stop_entity_list(entity_list) + dbs = stop_manifest.dbs + for db in dbs: + self._control.stop_db(db) except SmartSimError as e: logger.error(e) raise def generate( - self, *args: t.Any, tag: t.Optional[str] = None, overwrite: bool = False + self, + *args: t.Any, + tag: t.Optional[str] = None, + overwrite: bool = False, + verbose: bool = False, ) -> None: """Generate the file structure for an ``Experiment`` @@ -251,9 +266,11 @@ def generate( :param overwrite: overwrite existing folders and 
contents, defaults to False :type overwrite: bool, optional + :param verbose: log parameter settings to std out + :type verbose: bool """ try: - generator = Generator(self.exp_path, overwrite=overwrite) + generator = Generator(self.exp_path, overwrite=overwrite, verbose=verbose) if tag: generator.set_tag(tag) generator.generate_experiment(*args) @@ -496,7 +513,7 @@ def create_model( "epoch": 10, "lr": 0.001 } - model = exp.create_model("pytorch_model", run_settings, params=params) + model = exp.create_model("pytorch_model", run_settings, params=train_params) model.attach_generator_files(to_configure="./train.cfg") exp.generate(model) @@ -682,13 +699,14 @@ def create_database( port: int = 6379, db_nodes: int = 1, batch: bool = False, - hosts: t.Optional[t.List[str]] = None, + hosts: t.Optional[t.Union[t.List[str], str]] = None, run_command: str = "auto", interface: str = "ipogif0", account: t.Optional[str] = None, time: t.Optional[str] = None, queue: t.Optional[str] = None, single_cmd: bool = True, + db_identifier: str = "orchestrator", **kwargs: t.Any, ) -> Orchestrator: """Initialize an Orchestrator database @@ -731,12 +749,18 @@ def create_database( :type queue: str, optional :param single_cmd: run all shards with one (MPMD) command, defaults to True :type single_cmd: bool, optional + :param db_identifier: an identifier to distinguish this orchestrator in + multiple-database experiments, defaults to "orchestrator" + :type db_identifier: str, optional :raises SmartSimError: if detection of launcher or of run command fails :raises SmartSimError: if user indicated an incompatible run command for the launcher :return: Orchestrator :rtype: Orchestrator or derived class """ + + self.append_to_db_identifier_list(db_identifier) + return Orchestrator( port=port, db_nodes=db_nodes, @@ -749,6 +773,7 @@ def create_database( queue=queue, single_cmd=single_cmd, launcher=self._launcher, + db_identifier=db_identifier, **kwargs, ) @@ -772,18 +797,17 @@ def 
reconnect_orchestrator(self, checkpoint: str) -> Orchestrator: logger.error(e) raise - # pylint: disable-next=redefined-builtin - def summary(self, format: str = "github") -> str: + def summary(self, style: str = "github") -> str: """Return a summary of the ``Experiment`` The summary will show each instance that has been launched and completed in this ``Experiment`` - :param format: the style in which the summary table is formatted, + :param style: the style in which the summary table is formatted, for a full list of styles see: https://github.com/astanin/python-tabulate#table-format, defaults to "github" - :type format: str, optional + :type style: str, optional :return: tabulate string of ``Experiment`` history :rtype: str """ @@ -814,7 +838,7 @@ def summary(self, format: str = "github") -> str: values, headers, showindex=True, - tablefmt=format, + tablefmt=style, missingval="None", disable_numparse=True, ) @@ -835,7 +859,7 @@ def _launch_summary(self, manifest: Manifest) -> None: if self._control.orchestrator_active: summary += "Database Status: active\n" - elif manifest.db: + elif manifest.dbs: summary += "Database Status: launching\n" else: summary += "Database Status: inactive\n" @@ -846,3 +870,46 @@ def _launch_summary(self, manifest: Manifest) -> None: def __str__(self) -> str: return self.name + + def append_to_db_identifier_list(self, db_identifier: str) -> None: + """Check if db_identifier already exists when calling create_database""" + if db_identifier in self.db_identifiers: + logger.warning( + f"A database with the identifier {db_identifier} has already been made " + "An error will be raised if multiple databases are started " + "with the same identifier" + ) + # Otherwise, add + self.db_identifiers.add(db_identifier) + + def enable_telemetry(self) -> None: + """Experiments will start producing telemetry for all entities run + through ``Experiment.start`` + + .. 
warning:: + + This method is currently implemented so that ALL ``Experiment`` + instances will begin producing telemetry data. In the future it + is planned to have this method work on a "per instance" basis! + """ + self._set_telemetry(True) + + def disable_telemetry(self) -> None: + """Experiments will stop producing telemetry for all entities run + through ``Experiment.start`` + + .. warning:: + + This method is currently implemented so that ALL ``Experiment`` + instances will stop producing telemetry data. In the future it + is planned to have this method work on a "per instance" basis! + """ + self._set_telemetry(False) + + @staticmethod + def _set_telemetry(switch: bool, /) -> None: + tm_key = "SMARTSIM_FLAG_TELEMETRY" + if switch: + os.environ[tm_key] = "1" + else: + os.environ[tm_key] = "0" diff --git a/smartsim/log.py b/smartsim/log.py index 9011b3d1b..baf54f068 100644 --- a/smartsim/log.py +++ b/smartsim/log.py @@ -23,19 +23,23 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import typing as t import logging import os import sys +import typing as t import coloredlogs -# constants for logging -coloredlogs.DEFAULT_DATE_FORMAT = "%H:%M:%S" -coloredlogs.DEFAULT_LOG_FORMAT = ( +# constants +DEFAULT_DATE_FORMAT: t.Final[str] = "%H:%M:%S" +DEFAULT_LOG_FORMAT: t.Final[str] = ( "%(asctime)s %(hostname)s %(name)s[%(process)d] %(levelname)s %(message)s" ) +# configure colored loggs +coloredlogs.DEFAULT_DATE_FORMAT = DEFAULT_DATE_FORMAT +coloredlogs.DEFAULT_LOG_FORMAT = DEFAULT_LOG_FORMAT + def _get_log_level() -> str: """Get the logging level based on environment variable @@ -121,5 +125,7 @@ def log_to_file(filename: str, log_level: str = "debug") -> None: :type log_level: int | str """ logger = logging.getLogger("SmartSim") - stream = open(filename, "w+", encoding="utf-8") # pylint: disable=consider-using-with + stream = open( # pylint: disable=consider-using-with + filename, "w+", encoding="utf-8" + ) coloredlogs.install(stream=stream, logger=logger, level=log_level) diff --git a/smartsim/ml/data.py b/smartsim/ml/data.py index 8e617f914..3dfca9f0c 100644 --- a/smartsim/ml/data.py +++ b/smartsim/ml/data.py @@ -35,7 +35,6 @@ from ..error import SSInternalError from ..log import get_logger - logger = get_logger(__name__) @@ -180,7 +179,7 @@ def __init__( if not sample_name: raise ValueError("Sample name can not be empty") - self.client = Client(address=address, cluster=cluster) + self.client = Client(cluster, address=address) self.verbose = verbose self.batch_idx = 0 self.rank = rank @@ -323,7 +322,7 @@ def __init__( self._info = data_info_or_list_name elif isinstance(data_info_or_list_name, str): self._info = DataInfo(list_name=data_info_or_list_name) - client = Client(self.address, self.cluster) + client = Client(self.cluster, self.address) self._info.download(client) else: raise TypeError("data_info_or_list_name must be either DataInfo or str") @@ -410,7 +409,7 @@ def init_samples(self, init_trials: int = -1) -> None: :param init_trials: 
maximum number of attempts to fetch data :type init_trials: int """ - self._client = Client(self.address, self.cluster) + self._client = Client(self.cluster, self.address) num_trials = 0 max_trials = init_trials or -1 diff --git a/smartsim/ml/tf/data.py b/smartsim/ml/tf/data.py index 823553786..ae0b9aadd 100644 --- a/smartsim/ml/tf/data.py +++ b/smartsim/ml/tf/data.py @@ -24,8 +24,9 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import numpy as np import typing as t + +import numpy as np from tensorflow import keras from smartsim.ml import DataDownloader diff --git a/smartsim/ml/tf/utils.py b/smartsim/ml/tf/utils.py index 7ef8fb3c6..c8018ac32 100644 --- a/smartsim/ml/tf/utils.py +++ b/smartsim/ml/tf/utils.py @@ -24,11 +24,11 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import typing as t from pathlib import Path -import tensorflow as tf import keras -import typing as t +import tensorflow as tf from tensorflow.python.framework.convert_to_constants import ( convert_variables_to_constants_v2, ) diff --git a/smartsim/ml/torch/data.py b/smartsim/ml/torch/data.py index 69d054476..166a29e05 100644 --- a/smartsim/ml/torch/data.py +++ b/smartsim/ml/torch/data.py @@ -24,9 +24,10 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import typing as t + import numpy as np import torch -import typing as t from smartredis import Client, Dataset from smartsim.ml.data import DataDownloader @@ -45,7 +46,7 @@ def __init__(self, **kwargs: t.Any) -> None: def _add_samples(self, indices: t.List[int]) -> None: if self.client is None: - client = Client(self.address, self.cluster) + client = Client(self.cluster, self.address) else: client = self.client diff --git a/smartsim/servertype.py b/smartsim/servertype.py new file mode 100644 index 000000000..a83149c23 --- /dev/null +++ b/smartsim/servertype.py @@ -0,0 +1,30 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +# Server Types +CLUSTERED = "Clustered" +STANDALONE = "Standalone" diff --git a/smartsim/settings/alpsSettings.py b/smartsim/settings/alpsSettings.py index dc19dc475..b36c3d333 100644 --- a/smartsim/settings/alpsSettings.py +++ b/smartsim/settings/alpsSettings.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from __future__ import annotations + import typing as t from ..error import SSUnsupportedError diff --git a/smartsim/settings/base.py b/smartsim/settings/base.py index b1e57ad37..a6df4eed4 100644 --- a/smartsim/settings/base.py +++ b/smartsim/settings/base.py @@ -34,9 +34,11 @@ logger = get_logger(__name__) - +# fmt: off class SettingsBase: ... +# fmt: on + # pylint: disable=too-many-public-methods class RunSettings(SettingsBase): diff --git a/smartsim/settings/cobaltSettings.py b/smartsim/settings/cobaltSettings.py index d0ad1a05f..5a0e07b40 100644 --- a/smartsim/settings/cobaltSettings.py +++ b/smartsim/settings/cobaltSettings.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import typing as t + from .base import BatchSettings diff --git a/smartsim/settings/lsfSettings.py b/smartsim/settings/lsfSettings.py index eeb0e4bf1..47fe91802 100644 --- a/smartsim/settings/lsfSettings.py +++ b/smartsim/settings/lsfSettings.py @@ -99,9 +99,7 @@ def set_cpus_per_rs(self, cpus_per_rs: int) -> None: if self.colocated_db_settings: db_cpus = int(self.colocated_db_settings.get("db_cpus", 0)) if not db_cpus: - raise ValueError( - "db_cpus must be configured on colocated_db_settings" - ) + raise ValueError("db_cpus must be configured on colocated_db_settings") if cpus_per_rs < db_cpus: raise ValueError( diff --git a/smartsim/settings/mpiSettings.py b/smartsim/settings/mpiSettings.py index 4a3a5afc4..5b6b520e3 100644 --- a/smartsim/settings/mpiSettings.py +++ b/smartsim/settings/mpiSettings.py @@ -81,6 +81,7 @@ def __init__( **kwargs, ) self.mpmd: t.List[RunSettings] = [] + self.affinity_script: t.List[str] = [] if not shutil.which(self._run_command): msg = ( diff --git a/smartsim/settings/mpirunSettings.py b/smartsim/settings/mpirunSettings.py index f061dc2a7..b290e2355 100644 --- a/smartsim/settings/mpirunSettings.py +++ b/smartsim/settings/mpirunSettings.py @@ -27,6 +27,7 @@ from warnings import simplefilter, warn from ..log import get_logger + # pylint: disable-next=unused-import from .mpiSettings import MpiexecSettings, MpirunSettings, OrterunSettings diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py index bee381a84..bcfee1ff1 100644 --- a/smartsim/settings/palsSettings.py +++ b/smartsim/settings/palsSettings.py @@ -182,6 +182,16 @@ def set_walltime(self, walltime: str) -> None: """ logger.warning("set_walltime not supported under PALS") + def set_gpu_affinity_script(self, affinity: str, *args: t.Any) -> None: + """Set the GPU affinity through a bash script + + :param affinity: path to the affinity script + :type affinity: str + """ + self.affinity_script.append(str(affinity)) + for arg in args: + 
self.affinity_script.append(str(arg)) + def format_run_args(self) -> t.List[str]: """Return a list of MPI-standard formatted run arguments @@ -199,6 +209,10 @@ def format_run_args(self) -> t.List[str]: args += [prefix + opt] else: args += [prefix + opt, str(value)] + + if self.affinity_script: + args += self.affinity_script + return args def format_env_vars(self) -> t.List[str]: @@ -221,3 +235,20 @@ def format_env_vars(self) -> t.List[str]: formatted += ["--envlist", ",".join(export_vars)] return formatted + + def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: + """Set the hostlist for the PALS ``mpiexec`` command + + This sets ``--hosts`` + + :param host_list: list of host names + :type host_list: str | list[str] + :raises TypeError: if not str or list of str + """ + if isinstance(host_list, str): + host_list = [host_list.strip()] + if not isinstance(host_list, list): + raise TypeError("host_list argument must be a list of strings") + if not all(isinstance(host, str) for host in host_list): + raise TypeError("host_list argument must be list of strings") + self.run_args["hosts"] = ",".join(host_list) diff --git a/smartsim/settings/pbsSettings.py b/smartsim/settings/pbsSettings.py index e5e7f30e4..0a4b0868a 100644 --- a/smartsim/settings/pbsSettings.py +++ b/smartsim/settings/pbsSettings.py @@ -26,10 +26,12 @@ import typing as t -from .._core.utils import init_default -from ..error import SmartSimError +from ..error import SSConfigError +from ..log import get_logger from .base import BatchSettings +logger = get_logger(__name__) + class QsubBatchSettings(BatchSettings): def __init__( @@ -39,7 +41,7 @@ def __init__( time: t.Optional[str] = None, queue: t.Optional[str] = None, account: t.Optional[str] = None, - resources: t.Optional[t.Dict[str, t.Optional[str]]] = None, + resources: t.Optional[t.Dict[str, t.Union[str, int]]] = None, batch_args: t.Optional[t.Dict[str, t.Optional[str]]] = None, **kwargs: t.Any, ): @@ -66,10 +68,18 @@ def __init__( 
:param batch_args: overrides for PBS batch arguments, defaults to None :type batch_args: dict[str, str], optional """ - self._time: t.Optional[str] = None - self._nodes: t.Optional[int] = None + self._ncpus = ncpus + self.resources = resources or {} + resource_nodes = self.resources.get("nodes", None) + + if nodes and resource_nodes: + raise ValueError( + "nodes was incorrectly specified as a constructor parameter and also " + "as a key in the resource mapping" + ) + # time, queue, nodes, and account set in parent class init super().__init__( "qsub", @@ -80,20 +90,33 @@ def __init__( time=time, **kwargs, ) - self.resources = init_default({}, resources, dict) + self._hosts: t.List[str] = [] + @property + def resources(self) -> t.Dict[str, t.Union[str, int]]: + return self._resources.copy() + + @resources.setter + def resources(self, resources: t.Dict[str, t.Union[str, int]]) -> None: + self._sanity_check_resources(resources) + self._resources = resources.copy() + def set_nodes(self, num_nodes: int) -> None: """Set the number of nodes for this batch job - If a select argument is provided in ``QsubBatchSettings.resources`` - this value will be overridden + In PBS, 'select' is the more primitive way of describing how + many nodes to allocate for the job. 'nodes' is equivalent to + 'select' with a 'place' statement. Assuming that only advanced + users would use 'set_resource' instead, defining the number of + nodes here is sets the 'nodes' resource. 
:param num_nodes: number of nodes :type num_nodes: int """ + if num_nodes: - self._nodes = int(num_nodes) + self.set_resource("nodes", num_nodes) def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: """Specify the hostlist for this job @@ -123,7 +146,7 @@ def set_walltime(self, walltime: str) -> None: :type walltime: str """ if walltime: - self._time = walltime + self.set_resource("walltime", walltime) def set_queue(self, queue: str) -> None: """Set the queue for the batch job @@ -155,7 +178,7 @@ def set_account(self, account: str) -> None: if account: self.batch_args["A"] = str(account) - def set_resource(self, resource_name: str, value: str) -> None: + def set_resource(self, resource_name: str, value: t.Union[str, int]) -> None: """Set a resource value for the Qsub batch If a select statement is provided, the nodes and ncpus @@ -168,7 +191,10 @@ def set_resource(self, resource_name: str, value: str) -> None: """ # TODO add error checking here # TODO include option to overwrite place (warning for orchestrator?) - self.resources[resource_name] = value + updated_dict = self.resources + updated_dict.update({resource_name: value}) + self._sanity_check_resources(updated_dict) + self.resources = updated_dict def format_batch_args(self) -> t.List[str]: """Get the formatted batch arguments for a preview @@ -185,40 +211,69 @@ def format_batch_args(self) -> t.List[str]: opts += [" ".join((prefix + opt, str(value)))] return opts + def _sanity_check_resources( + self, resources: t.Optional[t.Dict[str, t.Union[str, int]]] = None + ) -> None: + """Check that only select or nodes was specified in resources + + Note: For PBS Pro, nodes is equivalent to 'select' and 'place' so + they are not quite synonyms. 
Here we assume that + """ + # Note: isinstance check here to avoid collision with default + checked_resources = resources if isinstance(resources, dict) else self.resources + + has_select = checked_resources.get("select", None) + has_nodes = checked_resources.get("nodes", None) + + if has_select and has_nodes: + raise SSConfigError( + "'select' and 'nodes' cannot both be specified. This can happen " + "if nodes were specified using the 'set_nodes' method and " + "'select' was set using 'set_resource'. Please only specify one." + ) + + if has_select and not isinstance(has_select, int): + raise TypeError("The value for 'select' must be an integer") + if has_nodes and not isinstance(has_nodes, int): + raise TypeError("The value for 'nodes' must be an integer") + + for key, value in checked_resources.items(): + if not isinstance(key, str): + raise TypeError( + f"The type of {key=} is {type(key)}. Only int and str " + "are allowed." + ) + if not isinstance(value, (str, int)): + raise TypeError( + f"The value associated with {key=} is {type(value)}. Only int " + "and str are allowed." 
+ ) + def _create_resource_list(self) -> t.List[str]: + self._sanity_check_resources() res = [] - # get select statement from resources or kwargs - if "select" in self.resources: - res += [f"-l select={str(self.resources['select'])}"] - else: - select = "-l select=" - if self._nodes: - select += str(self._nodes) - else: - raise SmartSimError( - "Insufficient resource specification: no nodes or select statement" - ) - if self._ncpus: - select += f":ncpus={self._ncpus}" - if self._hosts: - hosts = ["=".join(("host", str(host))) for host in self._hosts] - select += f":{'+'.join(hosts)}" - res += [select] - - if "place" in self.resources: - res += [f"-l place={str(self.resources['place'])}"] - else: - res += ["-l place=scatter"] + # Pop off some specific keywords that need to be treated separately + resources = self.resources # Note this is a copy so not modifying original - # get time from resources or kwargs - if "walltime" in self.resources: - res += [f"-l walltime={str(self.resources['walltime'])}"] + # Construct the basic select/nodes statement + if select := resources.pop("select", None): + select_command = f"-l select={select}" + elif nodes := resources.pop("nodes", None): + select_command = f"-l nodes={nodes}" else: - if self._time: - res += [f"-l walltime={self._time}"] + raise SSConfigError( + "Insufficient resource specification: no nodes or select statement" + ) + if self._ncpus: + select_command += f":ncpus={self._ncpus}" + if self._hosts: + hosts = ["=".join(("host", str(host))) for host in self._hosts] + select_command += f":{'+'.join(hosts)}" + res += [select_command] + + # All other "standard" resource specs + for resource, value in resources.items(): + res += [f"-l {resource}={value}"] - for resource, value in self.resources.items(): - if resource not in ["select", "walltime", "place"]: - res += [f"-l {resource}={str(value)}"] return res diff --git a/smartsim/settings/settings.py b/smartsim/settings/settings.py index c3ee98593..b09286e8c 100644 --- 
a/smartsim/settings/settings.py +++ b/smartsim/settings/settings.py @@ -28,23 +28,23 @@ from .._core.utils.helpers import is_valid_cmd from ..error import SmartSimError -from ..wlm import detect_launcher from ..settings import ( - base, - CobaltBatchSettings, - QsubBatchSettings, - SbatchSettings, + AprunSettings, BsubBatchSettings, + CobaltBatchSettings, Container, - RunSettings, - AprunSettings, - SrunSettings, - MpirunSettings, + JsrunSettings, MpiexecSettings, + MpirunSettings, OrterunSettings, - JsrunSettings, PalsMpiexecSettings, + QsubBatchSettings, + RunSettings, + SbatchSettings, + SrunSettings, + base, ) +from ..wlm import detect_launcher _TRunSettingsSelector = t.Callable[[str], t.Callable[..., RunSettings]] diff --git a/smartsim/settings/slurmSettings.py b/smartsim/settings/slurmSettings.py index 60280fce9..8da8659e1 100644 --- a/smartsim/settings/slurmSettings.py +++ b/smartsim/settings/slurmSettings.py @@ -336,7 +336,7 @@ def check_env_vars(self) -> None: # If a variable is defined, it will take precedence over --export # we warn the user preexisting_var = os.environ.get(k, None) - if preexisting_var is not None: + if preexisting_var is not None and preexisting_var != v: msg = ( f"Variable {k} is set to {preexisting_var} in current " "environment. 
If the job is running in an interactive " diff --git a/smartsim/status.py b/smartsim/status.py index 4d1749e71..74d440b8e 100644 --- a/smartsim/status.py +++ b/smartsim/status.py @@ -32,6 +32,7 @@ STATUS_FAILED = "Failed" STATUS_NEW = "New" STATUS_PAUSED = "Paused" +STATUS_NEVER_STARTED = "NeverStarted" # SmartSim status mapping SMARTSIM_STATUS = { @@ -41,6 +42,7 @@ "Cancelled": STATUS_CANCELLED, "Failed": STATUS_FAILED, "New": STATUS_NEW, + "NeverStarted": STATUS_NEVER_STARTED, } # Status groupings diff --git a/smartsim/wlm/__init__.py b/smartsim/wlm/__init__.py index 8f3b1b097..d7dd298be 100644 --- a/smartsim/wlm/__init__.py +++ b/smartsim/wlm/__init__.py @@ -25,9 +25,9 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import os +import typing as t from shutil import which from subprocess import run -import typing as t from ..error import SSUnsupportedError from . import pbs as _pbs diff --git a/smartsim/wlm/slurm.py b/smartsim/wlm/slurm.py index 8fe12b3f9..ba46fb64c 100644 --- a/smartsim/wlm/slurm.py +++ b/smartsim/wlm/slurm.py @@ -237,7 +237,8 @@ def _get_alloc_cmd( options: t.Optional[t.Dict[str, str]] = None, ) -> t.List[str]: """Return the command to request an allocation from Slurm with - the class variables as the slurm options.""" + the class variables as the slurm options. 
+ """ salloc_args = [ "--no-shell", diff --git a/tests/backends/run_sklearn_onnx.py b/tests/backends/run_sklearn_onnx.py index a2aa2aaca..d4377bbde 100644 --- a/tests/backends/run_sklearn_onnx.py +++ b/tests/backends/run_sklearn_onnx.py @@ -45,7 +45,6 @@ def build_lin_reg(): def build_kmeans(): - X = np.arange(20, dtype=np.float32).reshape(10, 2) tr = KMeans(n_clusters=2) tr.fit(X) @@ -76,7 +75,6 @@ def run_model(client, model_name, device, model, model_input, in_name, out_names def run(device): - # connect a client to the database client = Client(cluster=False) diff --git a/tests/backends/run_tf.py b/tests/backends/run_tf.py index f32dacb99..c9cf0ee04 100644 --- a/tests/backends/run_tf.py +++ b/tests/backends/run_tf.py @@ -34,7 +34,6 @@ def create_tf_mnist_model(): - model = keras.Sequential( layers=[ keras.layers.InputLayer(input_shape=(28, 28), name="input"), @@ -53,7 +52,6 @@ def create_tf_mnist_model(): def run(device): - model = create_tf_mnist_model() client = Client(cluster=False) diff --git a/tests/backends/run_torch.py b/tests/backends/run_torch.py index 2ea808189..d57cfad9d 100644 --- a/tests/backends/run_torch.py +++ b/tests/backends/run_torch.py @@ -75,7 +75,6 @@ def calc_svd(input_tensor): def run(device): - # connect a client to the database client = Client(cluster=False) diff --git a/tests/backends/test_cli_mini_exp.py b/tests/backends/test_cli_mini_exp.py index b3897c73b..7c793e915 100644 --- a/tests/backends/test_cli_mini_exp.py +++ b/tests/backends/test_cli_mini_exp.py @@ -27,15 +27,28 @@ from contextlib import contextmanager +import pytest import smartredis import smartsim._core._cli.validate from smartsim._core.utils.helpers import installed_redisai_backends +sklearn_available = True +try: + from skl2onnx import to_onnx + from sklearn.cluster import KMeans + from sklearn.datasets import load_iris + from sklearn.ensemble import RandomForestRegressor + from sklearn.linear_model import LinearRegression + from sklearn.model_selection import 
train_test_split + +except ImportError: + sklearn_available = False + def test_cli_mini_exp_doesnt_error_out_with_dev_build( local_db, - fileutils, + test_dir, monkeypatch, ): """Presumably devs running the test suite have built SS correctly. @@ -45,8 +58,8 @@ def test_cli_mini_exp_doesnt_error_out_with_dev_build( @contextmanager def _mock_make_managed_local_orc(*a, **kw): - client_addr ,= local_db.get_address() - yield smartredis.Client(address=client_addr, cluster=False) + (client_addr,) = local_db.get_address() + yield smartredis.Client(False, address=client_addr) monkeypatch.setattr( smartsim._core._cli.validate, @@ -54,17 +67,17 @@ def _mock_make_managed_local_orc(*a, **kw): _mock_make_managed_local_orc, ) backends = installed_redisai_backends() - db_port ,= local_db.ports + (db_port,) = local_db.ports smartsim._core._cli.validate.test_install( # Shouldn't matter bc we are stubbing creation of orc # but best to give it "correct" vals for safety - location=fileutils.get_test_dir(), + location=test_dir, port=db_port, # Always test on CPU, heads don't always have GPU device="CPU", # Test the backends the dev has installed with_tf="tensorflow" in backends, with_pt="torch" in backends, - with_onnx="onnxruntime" in backends, + with_onnx="onnxruntime" in backends and sklearn_available, ) diff --git a/tests/backends/test_dataloader.py b/tests/backends/test_dataloader.py index 5dd1fe4ed..37c4296ef 100644 --- a/tests/backends/test_dataloader.py +++ b/tests/backends/test_dataloader.py @@ -33,18 +33,32 @@ from smartsim.database import Orchestrator from smartsim.error.errors import SSInternalError from smartsim.experiment import Experiment +from smartsim.log import get_logger from smartsim.ml.data import DataInfo, TrainingDataUploader from smartsim.status import STATUS_COMPLETED +logger = get_logger(__name__) + shouldrun_tf = True if shouldrun_tf: try: + import tensorflow as tf from tensorflow import keras from smartsim.ml.tf import DynamicDataGenerator as TFDataGenerator 
from smartsim.ml.tf import StaticDataGenerator as TFStaticDataGenerator except: shouldrun_tf = False + else: + if pytest.test_device == "GPU": + try: + for device in tf.config.list_physical_devices("GPU"): + tf.config.set_logical_device_configuration( + device, + [tf.config.LogicalDeviceConfiguration(memory_limit=5_000)], + ) + except Exception: + logger.warning("Could not set TF max memory limit for GPU") shouldrun_torch = True if shouldrun_torch: @@ -52,10 +66,8 @@ import torch from smartsim.ml.torch import DataLoader - from smartsim.ml.torch import \ - DynamicDataGenerator as TorchDataGenerator - from smartsim.ml.torch import \ - StaticDataGenerator as TorchStaticDataGenerator + from smartsim.ml.torch import DynamicDataGenerator as TorchDataGenerator + from smartsim.ml.torch import StaticDataGenerator as TorchStaticDataGenerator except: shouldrun_torch = False @@ -155,9 +167,10 @@ def train_tf(generator): @pytest.mark.skipif(not shouldrun_tf, reason="Test needs TensorFlow to run") -def test_tf_dataloaders(fileutils, wlmutils): - test_dir = fileutils.make_test_dir() - exp = Experiment("test_tf_dataloaders", test_dir, launcher=wlmutils.get_test_launcher()) +def test_tf_dataloaders(test_dir, wlmutils): + exp = Experiment( + "test_tf_dataloaders", test_dir, launcher=wlmutils.get_test_launcher() + ) orc: Orchestrator = wlmutils.get_orchestrator() exp.generate(orc) exp.start(orc) @@ -221,9 +234,10 @@ def create_trainer_torch(experiment: Experiment, filedir, wlmutils): @pytest.mark.skipif(not shouldrun_torch, reason="Test needs Torch to run") -def test_torch_dataloaders(fileutils, wlmutils): - test_dir = fileutils.make_test_dir() - exp = Experiment("test_tf_dataloaders", test_dir, launcher=wlmutils.get_test_launcher()) +def test_torch_dataloaders(fileutils, test_dir, wlmutils): + exp = Experiment( + "test_tf_dataloaders", test_dir, launcher=wlmutils.get_test_launcher() + ) orc: Orchestrator = wlmutils.get_orchestrator() config_dir = fileutils.get_test_dir_path("ml") 
exp.generate(orc) @@ -271,10 +285,10 @@ def test_torch_dataloaders(fileutils, wlmutils): for _ in range(2): for _ in torch_static: continue - + trainer = create_trainer_torch(exp, config_dir, wlmutils) exp.start(trainer, block=True) - + assert exp.get_status(trainer)[0] == STATUS_COMPLETED except Exception as e: @@ -317,9 +331,12 @@ def test_data_info_repr(): @pytest.mark.skipif( not (shouldrun_torch or shouldrun_tf), reason="Requires TF or PyTorch" ) -def test_wrong_dataloaders(fileutils, wlmutils): - test_dir = fileutils.make_test_dir() - exp = Experiment("test-wrong-dataloaders", exp_path=test_dir, launcher=wlmutils.get_test_launcher()) +def test_wrong_dataloaders(test_dir, wlmutils): + exp = Experiment( + "test-wrong-dataloaders", + exp_path=test_dir, + launcher=wlmutils.get_test_launcher(), + ) orc = wlmutils.get_orchestrator() exp.generate(orc) exp.start(orc) diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 83a2e119b..1cfc1efcb 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -26,18 +26,16 @@ import sys -import time import pytest -import smartsim from smartsim import Experiment, status from smartsim._core.utils import installed_redisai_backends +from smartsim.entity import Ensemble +from smartsim.entity.dbobject import DBModel from smartsim.error.errors import SSUnsupportedError from smartsim.log import get_logger -from smartsim.entity.dbobject import DBModel - logger = get_logger(__name__) should_run_tf = True @@ -45,8 +43,10 @@ # Check TensorFlow is available for tests try: - import tensorflow.keras as keras + import tensorflow as tf + from tensorflow import keras from tensorflow.keras.layers import Conv2D, Input + except ImportError: should_run_tf = False else: @@ -60,6 +60,14 @@ def call(self, x): y = self.conv(x) return y + if pytest.test_device == "GPU": + try: + for device in tf.config.list_physical_devices("GPU"): + tf.config.set_logical_device_configuration( + device, 
[tf.config.LogicalDeviceConfiguration(memory_limit=5_000)] + ) + except: + logger.warning("Could not set TF max memory limit for GPU") should_run_tf &= "tensorflow" in installed_redisai_backends() @@ -136,7 +144,7 @@ def save_torch_cnn(path, file_name): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_tf_db_model(fileutils, wlmutils, mlutils): +def test_tf_db_model(fileutils, test_dir, wlmutils, mlutils): """Test TensorFlow DB Models on remote DB""" # Set experiment name @@ -147,8 +155,8 @@ def test_tf_db_model(fileutils, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_num_gpus = 1 # TF backend fails on multiple GPUs + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create the SmartSim Experiment @@ -157,14 +165,14 @@ def test_tf_db_model(fileutils, wlmutils, mlutils): # Create RunSettings run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) run_settings.set_nodes(1) - run_settings.set_tasks_per_node(1) + run_settings.set_tasks(1) # Create Model smartsim_model = exp.create_model("smartsim_model", run_settings) - smartsim_model.set_path(test_dir) # Create database - db = exp.create_database(port=test_port, interface=test_interface) + host = wlmutils.choose_host(run_settings) + db = exp.create_database(port=test_port, interface=test_interface, hosts=host) exp.generate(db) # Create and save ML model to filesystem @@ -178,6 +186,7 @@ def test_tf_db_model(fileutils, wlmutils, mlutils): model=model, device=test_device, devices_per_node=test_num_gpus, + first_device=0, inputs=inputs, outputs=outputs, tag="test", @@ -188,6 +197,7 @@ def test_tf_db_model(fileutils, wlmutils, mlutils): model_path=model_file2, device=test_device, devices_per_node=test_num_gpus, + first_device=0, inputs=inputs2, 
outputs=outputs2, tag="test", @@ -200,17 +210,21 @@ def test_tf_db_model(fileutils, wlmutils, mlutils): # Assert we have added both models assert len(smartsim_model._db_models) == 2 + exp.generate(smartsim_model) + # Launch and check successful completion try: exp.start(db, smartsim_model, block=True) statuses = exp.get_status(smartsim_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all( + stat == status.STATUS_COMPLETED for stat in statuses + ), f"Statuses: {statuses}" finally: exp.stop(db) @pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run") -def test_pt_db_model(fileutils, wlmutils, mlutils): +def test_pt_db_model(fileutils, test_dir, wlmutils, mlutils): """Test PyTorch DB Models on remote DB""" # Set experiment name @@ -221,8 +235,8 @@ def test_pt_db_model(fileutils, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 + test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py") # Create the SmartSim Experiment @@ -231,14 +245,14 @@ def test_pt_db_model(fileutils, wlmutils, mlutils): # Create RunSettings run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) run_settings.set_nodes(1) - run_settings.set_tasks_per_node(1) + run_settings.set_tasks(1) # Create Model smartsim_model = exp.create_model("smartsim_model", run_settings) - smartsim_model.set_path(test_dir) # Create database - db = exp.create_database(port=test_port, interface=test_interface) + host = wlmutils.choose_host(run_settings) + db = exp.create_database(port=test_port, interface=test_interface, hosts=host) exp.generate(db) # Create and save ML model to filesystem @@ -252,6 +266,7 @@ def test_pt_db_model(fileutils, wlmutils, mlutils): 
model_path=model_path, device=test_device, devices_per_node=test_num_gpus, + first_device=0, tag="test", ) @@ -259,21 +274,24 @@ def test_pt_db_model(fileutils, wlmutils, mlutils): for db_model in smartsim_model._db_models: logger.debug(db_model) - # Assert we have added both models assert len(smartsim_model._db_models) == 1 + exp.generate(smartsim_model) + # Launch and check successful completion try: exp.start(db, smartsim_model, block=True) statuses = exp.get_status(smartsim_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all( + stat == status.STATUS_COMPLETED for stat in statuses + ), f"Statuses: {statuses}" finally: exp.stop(db) @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_db_model_ensemble(fileutils, wlmutils, mlutils): +def test_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): """Test DBModels on remote DB, with an ensemble""" # Set experiment name @@ -284,8 +302,8 @@ def test_db_model_ensemble(fileutils, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_num_gpus = 1 # TF backend fails on multiple GPUs + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create the SmartSim Experiment @@ -294,20 +312,19 @@ def test_db_model_ensemble(fileutils, wlmutils, mlutils): # Create RunSettings run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) run_settings.set_nodes(1) - run_settings.set_tasks_per_node(1) + run_settings.set_tasks(1) # Create ensemble smartsim_ensemble = exp.create_ensemble( "smartsim_model", run_settings=run_settings, replicas=2 ) - smartsim_ensemble.set_path(test_dir) # Create Model smartsim_model = exp.create_model("smartsim_model", run_settings) - smartsim_model.set_path(test_dir) # Create database - db = 
exp.create_database(port=test_port, interface=test_interface) + host = wlmutils.choose_host(run_settings) + db = exp.create_database(port=test_port, interface=test_interface, hosts=host) exp.generate(db) # Create and save ML model to filesystem @@ -321,8 +338,9 @@ def test_db_model_ensemble(fileutils, wlmutils, mlutils): model=model, device=test_device, devices_per_node=test_num_gpus, + first_device=0, inputs=inputs, - outputs=outputs + outputs=outputs, ) # Add the second ML model individually to each SmartSim model @@ -334,6 +352,7 @@ def test_db_model_ensemble(fileutils, wlmutils, mlutils): model_path=model_file2, device=test_device, devices_per_node=test_num_gpus, + first_device=0, inputs=inputs2, outputs=outputs2, ) @@ -342,13 +361,14 @@ def test_db_model_ensemble(fileutils, wlmutils, mlutils): smartsim_ensemble.add_model(smartsim_model) # Add the second ML model to the newly added entity. This is - # because the test script run both ML models for all entities. + # because the test script runs both ML models for all entities. 
smartsim_model.add_ml_model( "cnn2", "TF", model_path=model_file2, device=test_device, devices_per_node=test_num_gpus, + first_device=0, inputs=inputs2, outputs=outputs2, ) @@ -358,17 +378,21 @@ def test_db_model_ensemble(fileutils, wlmutils, mlutils): # Assert we have added two models to each entity assert all([len(entity._db_models) == 2 for entity in smartsim_ensemble]) + exp.generate(smartsim_ensemble) + # Launch and check successful completion try: exp.start(db, smartsim_ensemble, block=True) statuses = exp.get_status(smartsim_ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all( + stat == status.STATUS_COMPLETED for stat in statuses + ), f"Statuses: {statuses}" finally: exp.stop(db) @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_tf(fileutils, wlmutils, mlutils): +def test_colocated_db_model_tf(fileutils, test_dir, wlmutils, mlutils): """Test DB Models on colocated DB (TensorFlow backend)""" # Set experiment name @@ -379,26 +403,22 @@ def test_colocated_db_model_tf(fileutils, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_num_gpus = 1 # TF backend fails on multiple GPUs + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create SmartSim Experience - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # Create RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create colocated Model colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.set_path(test_dir) colo_model.colocate_db_tcp( - port=test_port, - db_cpus=1, - debug=True, - 
ifname=test_interface + port=test_port, db_cpus=1, debug=True, ifname=test_interface ) # Create and save ML model to filesystem @@ -412,8 +432,9 @@ def test_colocated_db_model_tf(fileutils, wlmutils, mlutils): model_path=model_file, device=test_device, devices_per_node=test_num_gpus, + first_device=0, inputs=inputs, - outputs=outputs + outputs=outputs, ) colo_model.add_ml_model( "cnn2", @@ -421,6 +442,7 @@ def test_colocated_db_model_tf(fileutils, wlmutils, mlutils): model_path=model_file2, device=test_device, devices_per_node=test_num_gpus, + first_device=0, inputs=inputs2, outputs=outputs2, ) @@ -428,16 +450,21 @@ def test_colocated_db_model_tf(fileutils, wlmutils, mlutils): # Assert we have added both models assert len(colo_model._db_models) == 2 + exp.generate(colo_model) + # Launch and check successful completion try: exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all( + stat == status.STATUS_COMPLETED for stat in statuses + ), f"Statuses: {statuses}" finally: exp.stop(colo_model) + @pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run") -def test_colocated_db_model_pytorch(fileutils, wlmutils, mlutils): +def test_colocated_db_model_pytorch(fileutils, test_dir, wlmutils, mlutils): """Test DB Models on colocated DB (PyTorch backend)""" # Set experiment name @@ -448,26 +475,22 @@ def test_colocated_db_model_pytorch(fileutils, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 + test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py") # Create the SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, 
launcher=test_launcher, exp_path=test_dir) # Create colocated RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create colocated SmartSim Model colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.set_path(test_dir) colo_model.colocate_db_tcp( - port=test_port, - db_cpus=1, - debug=True, - ifname=test_interface + port=test_port, db_cpus=1, debug=True, ifname=test_interface ) # Create and save ML model to filesystem @@ -475,26 +498,33 @@ def test_colocated_db_model_pytorch(fileutils, wlmutils, mlutils): model_file = test_dir + "/model1.pt" # Add the ML model to the SmartSim Model - colo_model.add_ml_model("cnn", - "TORCH", - model_path=model_file, - device=test_device, - devices_per_node=test_num_gpus) + colo_model.add_ml_model( + "cnn", + "TORCH", + model_path=model_file, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + ) # Assert we have added both models assert len(colo_model._db_models) == 1 + exp.generate(colo_model) + # Launch and check successful completion try: exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all( + stat == status.STATUS_COMPLETED for stat in statuses + ), f"Statuses: {statuses}" finally: exp.stop(colo_model) @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): +def test_colocated_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): """Test DBModel on colocated ensembles, first colocating DB, then adding DBModel. 
""" @@ -507,8 +537,8 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_num_gpus = 1 # TF backend fails on multiple GPUs + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create the SmartSim Experiment @@ -517,22 +547,18 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): # Create RunSettings for colocated model colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create ensemble of two identical models - colo_ensemble = exp.create_ensemble( + colo_ensemble: Ensemble = exp.create_ensemble( "colocated_ens", run_settings=colo_settings, replicas=2 ) - colo_ensemble.set_path(test_dir) # Create a third model with a colocated database colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) colo_model.colocate_db_tcp( - port=test_port, - db_cpus=1, - debug=True, - ifname=test_interface + port=test_port, db_cpus=1, debug=True, ifname=test_interface ) # Create and save the ML models to the filesystem @@ -542,10 +568,7 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): # Colocate a database with the ensemble with two ensemble members for i, entity in enumerate(colo_ensemble): entity.colocate_db_tcp( - port=test_port + i + 1, - db_cpus=1, - debug=True, - ifname=test_interface + port=test_port + i + 1, db_cpus=1, debug=True, ifname=test_interface ) # Add ML model to each ensemble member individual to test that they # do not conflict with models add to the Ensemble object @@ -555,9 +578,11 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): model_path=model_file2, device=test_device, 
devices_per_node=test_num_gpus, + first_device=0, inputs=inputs2, outputs=outputs2, ) + entity.disable_key_prefixing() # Test adding a model from Ensemble object colo_ensemble.add_ml_model( @@ -566,6 +591,7 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): model_path=model_file, device=test_device, devices_per_node=test_num_gpus, + first_device=0, inputs=inputs, outputs=outputs, tag="test", @@ -581,21 +607,26 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): model_path=model_file2, device=test_device, devices_per_node=test_num_gpus, + first_device=0, inputs=inputs2, outputs=outputs2, ) + exp.generate(colo_ensemble) + # Launch and check successful completion try: exp.start(colo_ensemble, block=True) statuses = exp.get_status(colo_ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all( + stat == status.STATUS_COMPLETED for stat in statuses + ), f"Statuses: {statuses}" finally: exp.stop(colo_ensemble) @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): +def test_colocated_db_model_ensemble_reordered(fileutils, test_dir, wlmutils, mlutils): """Test DBModel on colocated ensembles, first adding the DBModel to the ensemble, then colocating DB. 
""" @@ -608,27 +639,25 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_num_gpus = 1 # TF backend fails on multiple GPUs + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create the SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # Create colocated RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create the ensemble of two identical SmartSim Model colo_ensemble = exp.create_ensemble( "colocated_ens", run_settings=colo_settings, replicas=2 ) - colo_ensemble.set_path(test_dir) # Create colocated SmartSim Model colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.set_path(test_dir) # Create and save ML model to filesystem model_file, inputs, outputs = save_tf_cnn(test_dir, "model1.pb") @@ -641,17 +670,15 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): model_path=model_file, device=test_device, devices_per_node=test_num_gpus, + first_device=0, inputs=inputs, - outputs=outputs + outputs=outputs, ) # Colocate a database with the first ensemble members for i, entity in enumerate(colo_ensemble): entity.colocate_db_tcp( - port = test_port + i, - db_cpus=1, - debug=True, - ifname=test_interface + port=test_port + i, db_cpus=1, debug=True, ifname=test_interface ) # Add ML models to each ensemble member to make sure they # do not conflict with other ML models @@ -661,19 +688,21 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): model_path=model_file2, device=test_device, 
devices_per_node=test_num_gpus, + first_device=0, inputs=inputs2, outputs=outputs2, ) + entity.disable_key_prefixing() # Add another ensemble member colo_ensemble.add_model(colo_model) # Colocate a database with the new ensemble member colo_model.colocate_db_tcp( - port=test_port + len(colo_ensemble), + port=test_port + len(colo_ensemble) - 1, db_cpus=1, debug=True, - ifname=test_interface + ifname=test_interface, ) # Add a ML model to the new ensemble member colo_model.add_ml_model( @@ -682,21 +711,26 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): model_path=model_file2, device=test_device, devices_per_node=test_num_gpus, + first_device=0, inputs=inputs2, outputs=outputs2, ) + exp.generate(colo_ensemble) + # Launch and check successful completion try: exp.start(colo_ensemble, block=True) statuses = exp.get_status(colo_ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all( + stat == status.STATUS_COMPLETED for stat in statuses + ), f"Statuses: {statuses}" finally: exp.stop(colo_ensemble) @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_errors(fileutils, wlmutils, mlutils): +def test_colocated_db_model_errors(fileutils, test_dir, wlmutils, mlutils): """Test error when colocated db model has no file.""" # Set experiment name @@ -707,26 +741,23 @@ def test_colocated_db_model_errors(fileutils, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_num_gpus = 1 # TF backend fails on multiple GPUs + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # Create colocated RunSettings colo_settings = 
exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create colocated SmartSim Model colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) colo_model.colocate_db_tcp( - port=test_port, - db_cpus=1, - debug=True, - ifname=test_interface + port=test_port, db_cpus=1, debug=True, ifname=test_interface ) # Get and save TF model @@ -736,40 +767,46 @@ def test_colocated_db_model_errors(fileutils, wlmutils, mlutils): # are only supported for non-colocated deployments with pytest.raises(SSUnsupportedError): colo_model.add_ml_model( - "cnn", "TF", model=model, device=test_device, - devices_per_node=test_num_gpus, inputs=inputs, outputs=outputs + "cnn", + "TF", + model=model, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + inputs=inputs, + outputs=outputs, ) # Create an ensemble with two identical replicas colo_ensemble = exp.create_ensemble( "colocated_ens", run_settings=colo_settings, replicas=2 ) - colo_ensemble.set_path(test_dir) # Colocate a db with each ensemble member for i, entity in enumerate(colo_ensemble): entity.colocate_db_tcp( - port=test_port + i, - db_cpus=1, - debug=True, - ifname=test_interface + port=test_port + i, db_cpus=1, debug=True, ifname=test_interface ) # Check that an error is raised because in-memory models # are only supported for non-colocated deployments with pytest.raises(SSUnsupportedError): colo_ensemble.add_ml_model( - "cnn", "TF", model=model, device=test_device, - devices_per_node=test_num_gpus, inputs=inputs, outputs=outputs + "cnn", + "TF", + model=model, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + inputs=inputs, + outputs=outputs, ) # Check error is still thrown if an in-memory model is used # with a colocated deployment. 
This test varies by adding # the SmartSIm model with a colocated database to the ensemble # after the ML model was been added to the ensemble. - colo_settings2 = exp.create_run_settings( - exe=sys.executable, exe_args=test_script - ) + colo_settings2 = exp.create_run_settings(exe=sys.executable, exe_args=test_script) # Reverse order of DBModel and model colo_ensemble2 = exp.create_ensemble( @@ -777,8 +814,14 @@ def test_colocated_db_model_errors(fileutils, wlmutils, mlutils): ) colo_ensemble2.set_path(test_dir) colo_ensemble2.add_ml_model( - "cnn", "TF", model=model, device=test_device, - devices_per_node=test_num_gpus, inputs=inputs, outputs=outputs + "cnn", + "TF", + model=model, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + inputs=inputs, + outputs=outputs, ) for i, entity in enumerate(colo_ensemble2): with pytest.raises(SSUnsupportedError): @@ -792,24 +835,26 @@ def test_colocated_db_model_errors(fileutils, wlmutils, mlutils): with pytest.raises(SSUnsupportedError): colo_ensemble.add_model(colo_model) + @pytest.mark.skipif(not should_run_tf, reason="Test needs TensorFlow to run") def test_inconsistent_params_db_model(): """Test error when devices_per_node parameter>1 when devices is set to CPU in DBModel""" - + # Create and save ML model to filesystem model, inputs, outputs = create_tf_cnn() with pytest.raises(SSUnsupportedError) as ex: - db_model = DBModel( + DBModel( "cnn", "TF", model=model, device="CPU", devices_per_node=2, + first_device=0, tag="test", inputs=inputs, outputs=outputs, ) assert ( - ex.value.args[0] - == "Cannot set devices_per_node>1 if CPU is specified under devices" - ) + ex.value.args[0] + == "Cannot set devices_per_node>1 if CPU is specified under devices" + ) diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py index 06492f60f..e6cacd4d0 100644 --- a/tests/backends/test_dbscript.py +++ b/tests/backends/test_dbscript.py @@ -24,21 +24,25 @@ # OR TORT (INCLUDING NEGLIGENCE OR 
OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import os import sys import pytest +from smartredis import * from smartsim import Experiment, status from smartsim._core.utils import installed_redisai_backends +from smartsim.entity.dbobject import DBScript from smartsim.error.errors import SSUnsupportedError from smartsim.log import get_logger - -from smartsim.entity.dbobject import DBScript +from smartsim.settings import MpiexecSettings, MpirunSettings logger = get_logger(__name__) should_run = True +supported_dbs = ["uds", "tcp"] + try: import torch except ImportError: @@ -52,7 +56,7 @@ def timestwo(x): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_db_script(fileutils, wlmutils, mlutils): +def test_db_script(fileutils, test_dir, wlmutils, mlutils): """Test DB scripts on remote DB""" # Set experiment name @@ -63,8 +67,8 @@ def test_db_script(fileutils, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") @@ -74,15 +78,15 @@ def test_db_script(fileutils, wlmutils, mlutils): # Create the RunSettings run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) run_settings.set_nodes(1) - run_settings.set_tasks_per_node(1) + run_settings.set_tasks(1) # Create the SmartSim Model smartsim_model = exp.create_model("smartsim_model", run_settings) - smartsim_model.set_path(test_dir) # Create the SmartSim database - db = exp.create_database(port=test_port, interface=test_interface) - exp.generate(db) + host = wlmutils.choose_host(run_settings) + db = 
exp.create_database(port=test_port, interface=test_interface, hosts=host) + exp.generate(db, smartsim_model) # Define the torch script string torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" @@ -92,7 +96,8 @@ def test_db_script(fileutils, wlmutils, mlutils): "test_script1", script_path=torch_script, device=test_device, - devices_per_node=test_num_gpus + devices_per_node=test_num_gpus, + first_device=0, ) # Add script via string @@ -100,7 +105,8 @@ def test_db_script(fileutils, wlmutils, mlutils): "test_script2", script=torch_script_str, device=test_device, - devices_per_node=test_num_gpus + devices_per_node=test_num_gpus, + first_device=0, ) # Add script function @@ -108,7 +114,8 @@ def test_db_script(fileutils, wlmutils, mlutils): "test_func", function=timestwo, device=test_device, - devices_per_node=test_num_gpus + devices_per_node=test_num_gpus, + first_device=0, ) # Assert we have all three scripts @@ -124,7 +131,7 @@ def test_db_script(fileutils, wlmutils, mlutils): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_db_script_ensemble(fileutils, wlmutils, mlutils): +def test_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): """Test DB scripts on remote DB""" # Set experiment name @@ -135,8 +142,8 @@ def test_db_script_ensemble(fileutils, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") @@ -146,20 +153,19 @@ def test_db_script_ensemble(fileutils, wlmutils, mlutils): # Create RunSettings run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) run_settings.set_nodes(1) - 
run_settings.set_tasks_per_node(1) + run_settings.set_tasks(1) # Create Ensemble with two identical models ensemble = exp.create_ensemble( "dbscript_ensemble", run_settings=run_settings, replicas=2 ) - ensemble.set_path(test_dir) # Create SmartSim model smartsim_model = exp.create_model("smartsim_model", run_settings) - smartsim_model.set_path(test_dir) # Create SmartSim database - db = exp.create_database(port=test_port, interface=test_interface) + host = wlmutils.choose_host(run_settings) + db = exp.create_database(port=test_port, interface=test_interface, hosts=host) exp.generate(db) # Create the script string @@ -170,7 +176,8 @@ def test_db_script_ensemble(fileutils, wlmutils, mlutils): "test_script1", script_path=torch_script, device=test_device, - devices_per_node=test_num_gpus + devices_per_node=test_num_gpus, + first_device=0, ) # Add script via string for each ensemble entity @@ -180,7 +187,8 @@ def test_db_script_ensemble(fileutils, wlmutils, mlutils): "test_script2", script=torch_script_str, device=test_device, - devices_per_node=test_num_gpus + devices_per_node=test_num_gpus, + first_device=0, ) # Add script via function @@ -188,7 +196,8 @@ def test_db_script_ensemble(fileutils, wlmutils, mlutils): "test_func", function=timestwo, device=test_device, - devices_per_node=test_num_gpus + devices_per_node=test_num_gpus, + first_device=0, ) # Add an additional ensemble member and attach a script to the new member @@ -197,7 +206,8 @@ def test_db_script_ensemble(fileutils, wlmutils, mlutils): "test_script2", script=torch_script_str, device=test_device, - devices_per_node=test_num_gpus + devices_per_node=test_num_gpus, + first_device=0, ) # Assert we have added both models to the ensemble @@ -206,6 +216,8 @@ def test_db_script_ensemble(fileutils, wlmutils, mlutils): # Assert we have added all three models to entities in ensemble assert all([len(entity._db_scripts) == 3 for entity in ensemble]) + exp.generate(ensemble) + try: exp.start(db, ensemble, block=True) 
statuses = exp.get_status(ensemble) @@ -215,7 +227,7 @@ def test_db_script_ensemble(fileutils, wlmutils, mlutils): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_colocated_db_script(fileutils, wlmutils, mlutils): +def test_colocated_db_script(fileutils, test_dir, wlmutils, mlutils): """Test DB Scripts on colocated DB""" # Set the experiment name @@ -226,27 +238,23 @@ def test_colocated_db_script(fileutils, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") # Create the SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # Create RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create model with colocated database colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.set_path(test_dir) colo_model.colocate_db_tcp( - port=test_port, - db_cpus=1, - debug=True, - ifname=test_interface, + port=test_port, db_cpus=1, debug=True, ifname=test_interface ) # Create string for script creation @@ -257,19 +265,23 @@ def test_colocated_db_script(fileutils, wlmutils, mlutils): "test_script1", script_path=torch_script, device=test_device, - devices_per_node=test_num_gpus + devices_per_node=test_num_gpus, + first_device=0, ) # Add script via string colo_model.add_script( "test_script2", script=torch_script_str, device=test_device, - devices_per_node=test_num_gpus + devices_per_node=test_num_gpus, + 
first_device=0, ) # Assert we have added both models assert len(colo_model._db_scripts) == 2 + exp.generate(colo_model) + for db_script in colo_model._db_scripts: logger.debug(db_script) @@ -282,7 +294,7 @@ def test_colocated_db_script(fileutils, wlmutils, mlutils): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_colocated_db_script_ensemble(fileutils, wlmutils, mlutils): +def test_colocated_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): """Test DB Scripts on colocated DB from ensemble, first colocating DB, then adding script. """ @@ -295,28 +307,26 @@ def test_colocated_db_script_ensemble(fileutils, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") # Create SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # Create RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create SmartSim Ensemble with two identical models colo_ensemble = exp.create_ensemble( "colocated_ensemble", run_settings=colo_settings, replicas=2 ) - colo_ensemble.set_path(test_dir) # Create a SmartSim model colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.set_path(test_dir) # Colocate a db with each ensemble entity and add a script # to each entity via file @@ -333,7 +343,8 @@ def test_colocated_db_script_ensemble(fileutils, wlmutils, mlutils): "test_script1", script_path=torch_script, device=test_device, 
- devices_per_node=test_num_gpus + devices_per_node=test_num_gpus, + first_device=0, ) # Colocate a db with the non-ensemble Model @@ -350,7 +361,8 @@ def test_colocated_db_script_ensemble(fileutils, wlmutils, mlutils): "test_script2", script=torch_script_str, device=test_device, - devices_per_node=test_num_gpus + devices_per_node=test_num_gpus, + first_device=0, ) # Add the third SmartSim model to the ensemble @@ -361,7 +373,8 @@ def test_colocated_db_script_ensemble(fileutils, wlmutils, mlutils): "test_script1", script_path=torch_script, device=test_device, - devices_per_node=test_num_gpus + devices_per_node=test_num_gpus, + first_device=0, ) # Assert we have added one model to the ensemble @@ -369,6 +382,8 @@ def test_colocated_db_script_ensemble(fileutils, wlmutils, mlutils): # Assert we have added both models to each entity assert all([len(entity._db_scripts) == 2 for entity in colo_ensemble]) + exp.generate(colo_ensemble) + # Launch and check successful completion try: exp.start(colo_ensemble, block=True) @@ -379,7 +394,7 @@ def test_colocated_db_script_ensemble(fileutils, wlmutils, mlutils): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils, mlutils): +def test_colocated_db_script_ensemble_reordered(fileutils, test_dir, wlmutils, mlutils): """Test DB Scripts on colocated DB from ensemble, first adding the script to the ensemble, then colocating the DB""" @@ -391,28 +406,26 @@ def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = 
fileutils.get_test_conf_path("torchscript.py") # Create SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # Create RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create Ensemble with two identical SmartSim Model colo_ensemble = exp.create_ensemble( "colocated_ensemble", run_settings=colo_settings, replicas=2 ) - colo_ensemble.set_path(test_dir) # Create an additional SmartSim Model entity colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.set_path(test_dir) # Add a script via string to the ensemble members torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" @@ -420,7 +433,8 @@ def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils, mlutils): "test_script2", script=torch_script_str, device=test_device, - devices_per_node=test_num_gpus + devices_per_node=test_num_gpus, + first_device=0, ) # Add a colocated database to the ensemble members @@ -438,7 +452,8 @@ def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils, mlutils): "test_script1", script_path=torch_script, device=test_device, - devices_per_node=test_num_gpus + devices_per_node=test_num_gpus, + first_device=0, ) # Add a colocated database to the non-ensemble SmartSim Model @@ -446,7 +461,7 @@ def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils, mlutils): port=test_port + len(colo_ensemble), db_cpus=1, debug=True, - ifname=test_interface + ifname=test_interface, ) # Add the non-ensemble SmartSim Model to the Ensemble @@ -456,7 +471,8 @@ def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils, mlutils): "test_script1", script_path=torch_script, device=test_device, - devices_per_node=test_num_gpus + devices_per_node=test_num_gpus, + first_device=0, ) # Assert we have added one model to the 
ensemble @@ -464,6 +480,8 @@ def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils, mlutils): # Assert we have added both models to each entity assert all([len(entity._db_scripts) == 2 for entity in colo_ensemble]) + exp.generate(colo_ensemble) + # Launch and check successful completion try: exp.start(colo_ensemble, block=True) @@ -474,7 +492,7 @@ def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils, mlutils): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_db_script_errors(fileutils, wlmutils, mlutils): +def test_db_script_errors(fileutils, test_dir, wlmutils, mlutils): """Test DB Scripts error when setting a serialized function on colocated DB""" # Set Experiment name @@ -485,22 +503,20 @@ def test_db_script_errors(fileutils, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") - torch_script = fileutils.get_test_conf_path("torchscript.py") # Create SmartSim experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # Create RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create a SmartSim model with a colocated database colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.set_path(test_dir) colo_model.colocate_db_tcp( port=test_port, db_cpus=1, @@ -515,7 +531,8 @@ def test_db_script_errors(fileutils, wlmutils, mlutils): "test_func", function=timestwo, device=test_device, - devices_per_node=test_num_gpus + 
devices_per_node=test_num_gpus, + first_device=0, ) # Create ensemble with two identical SmartSim Model entities @@ -523,7 +540,6 @@ def test_db_script_errors(fileutils, wlmutils, mlutils): colo_ensemble = exp.create_ensemble( "colocated_ensemble", run_settings=colo_settings, replicas=2 ) - colo_ensemble.set_path(test_dir) # Add a colocated database for each ensemble member for i, entity in enumerate(colo_ensemble): @@ -541,7 +557,8 @@ def test_db_script_errors(fileutils, wlmutils, mlutils): "test_func", function=timestwo, device=test_device, - devices_per_node=test_num_gpus + devices_per_node=test_num_gpus, + first_device=0, ) # Create an ensemble with two identical SmartSim Model entities @@ -549,14 +566,14 @@ def test_db_script_errors(fileutils, wlmutils, mlutils): colo_ensemble = exp.create_ensemble( "colocated_ensemble", run_settings=colo_settings, replicas=2 ) - colo_ensemble.set_path(test_dir) # Add an in-memory function to the ensemble colo_ensemble.add_function( "test_func", function=timestwo, device=test_device, - devices_per_node=test_num_gpus + devices_per_node=test_num_gpus, + first_device=0, ) # Check that an error is raised when trying to add @@ -576,19 +593,33 @@ def test_db_script_errors(fileutils, wlmutils, mlutils): # an in-memory script with pytest.raises(SSUnsupportedError): colo_ensemble.add_model(colo_model) - + + def test_inconsistent_params_db_script(fileutils): """Test error when devices_per_node>1 and when devices is set to CPU in DBScript constructor""" torch_script = fileutils.get_test_conf_path("torchscript.py") with pytest.raises(SSUnsupportedError) as ex: - db_script = DBScript( + _ = DBScript( name="test_script_db", - script_path = torch_script, + script_path=torch_script, device="CPU", devices_per_node=2, + first_device=0, ) assert ( - ex.value.args[0] - == "Cannot set devices_per_node>1 if CPU is specified under devices" - ) \ No newline at end of file + ex.value.args[0] + == "Cannot set devices_per_node>1 if CPU is specified 
under devices" + ) + with pytest.raises(SSUnsupportedError) as ex: + _ = DBScript( + name="test_script_db", + script_path=torch_script, + device="CPU", + devices_per_node=1, + first_device=5, + ) + assert ( + ex.value.args[0] + == "Cannot set first_device>0 if CPU is specified under devices" + ) diff --git a/tests/backends/test_onnx.py b/tests/backends/test_onnx.py index 312d56953..19c40017e 100644 --- a/tests/backends/test_onnx.py +++ b/tests/backends/test_onnx.py @@ -56,7 +56,7 @@ ) -def test_sklearn_onnx(fileutils, mlutils, wlmutils): +def test_sklearn_onnx(test_dir, mlutils, wlmutils): """This test needs two free nodes, 1 for the db and 1 some sklearn models here we test the following sklearn models: @@ -75,7 +75,7 @@ def test_sklearn_onnx(fileutils, mlutils, wlmutils): """ exp_name = "test_sklearn_onnx" - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, exp_path=test_dir, launcher=wlmutils.get_test_launcher()) test_device = mlutils.get_test_device() diff --git a/tests/backends/test_tf.py b/tests/backends/test_tf.py index e30ad4f24..06c148a95 100644 --- a/tests/backends/test_tf.py +++ b/tests/backends/test_tf.py @@ -50,7 +50,7 @@ (not tf_backend_available) or (not tf_available), reason="Requires RedisAI TF backend", ) -def test_keras_model(fileutils, mlutils, wlmutils): +def test_keras_model(test_dir, mlutils, wlmutils): """This test needs two free nodes, 1 for the db and 1 for a keras model script this test can run on CPU/GPU by setting SMARTSIM_TEST_DEVICE=GPU @@ -61,7 +61,7 @@ def test_keras_model(fileutils, mlutils, wlmutils): """ exp_name = "test_keras_model" - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, exp_path=test_dir, launcher=wlmutils.get_test_launcher()) test_device = mlutils.get_test_device() @@ -91,7 +91,6 @@ def test_keras_model(fileutils, mlutils, wlmutils): def create_tf_model(): - model = keras.Sequential( layers=[ keras.layers.InputLayer(input_shape=(28, 28), name="input"), @@ -110,9 +109,7 @@ def 
create_tf_model(): @pytest.mark.skipif(not tf_available, reason="Requires Tensorflow and Keras") -def test_freeze_model(fileutils): - test_dir = fileutils.make_test_dir() - +def test_freeze_model(test_dir): model = create_tf_model() model_path, inputs, outputs = freeze_model(model, test_dir, "mnist.pb") assert len(inputs) == 1 diff --git a/tests/backends/test_torch.py b/tests/backends/test_torch.py index ba663a433..71a63adb9 100644 --- a/tests/backends/test_torch.py +++ b/tests/backends/test_torch.py @@ -48,7 +48,7 @@ ) -def test_torch_model_and_script(fileutils, mlutils, wlmutils): +def test_torch_model_and_script(test_dir, mlutils, wlmutils): """This test needs two free nodes, 1 for the db and 1 for a torch model script Here we test both the torchscipt API and the NN API from torch @@ -61,7 +61,7 @@ def test_torch_model_and_script(fileutils, mlutils, wlmutils): """ exp_name = "test_torch_model_and_script" - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, exp_path=test_dir, launcher=wlmutils.get_test_launcher()) test_device = mlutils.get_test_device() diff --git a/tests/full_wlm/test_generic_batch_launch.py b/tests/full_wlm/test_generic_batch_launch.py index 4beccd41b..2b7db11e1 100644 --- a/tests/full_wlm/test_generic_batch_launch.py +++ b/tests/full_wlm/test_generic_batch_launch.py @@ -29,23 +29,35 @@ import pytest from smartsim import Experiment, status +from smartsim.settings import QsubBatchSettings # retrieved from pytest fixtures if pytest.test_launcher not in pytest.wlm_options: pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") +if (pytest.test_launcher == "pbs") and (not pytest.has_aprun): + pytestmark = pytest.mark.skip( + reason="Launching batch jobs is not supported on PBS without ALPS" + ) + + +def add_batch_resources(wlmutils, batch_settings): + if isinstance(batch_settings, QsubBatchSettings): + for key, value in wlmutils.get_batch_resources().items(): + batch_settings.set_resource(key, value) + -def 
test_batch_model(fileutils, wlmutils): +def test_batch_model(fileutils, test_dir, wlmutils): """Test the launch of a manually construced batch model""" exp_name = "test-batch-model" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") batch_settings = exp.create_batch_settings(nodes=1, time="00:01:00") batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, batch_settings) if wlmutils.get_test_launcher() == "cobalt": batch_settings.set_queue("debug-flat-quad") run_settings = wlmutils.get_run_settings("python", f"{script} --time=5") @@ -60,12 +72,11 @@ def test_batch_model(fileutils, wlmutils): assert statuses[0] == status.STATUS_COMPLETED -def test_batch_ensemble(fileutils, wlmutils): +def test_batch_ensemble(fileutils, test_dir, wlmutils): """Test the launch of a manually constructed batch ensemble""" exp_name = "test-batch-ensemble" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = wlmutils.get_run_settings("python", f"{script} --time=5") @@ -73,6 +84,7 @@ def test_batch_ensemble(fileutils, wlmutils): M2 = exp.create_model("m2", path=test_dir, run_settings=settings) batch = exp.create_batch_settings(nodes=1, time="00:01:00") + add_batch_resources(wlmutils, batch) batch.set_account(wlmutils.get_test_account()) if wlmutils.get_test_launcher() == "cobalt": @@ -87,15 +99,15 @@ def test_batch_ensemble(fileutils, wlmutils): assert all([stat == status.STATUS_COMPLETED for stat in statuses]) -def test_batch_ensemble_replicas(fileutils, wlmutils): +def test_batch_ensemble_replicas(fileutils, test_dir, wlmutils): exp_name = 
"test-batch-ensemble-replicas" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = wlmutils.get_run_settings("python", f"{script} --time=5") batch = exp.create_batch_settings(nodes=1, time="00:01:00") + add_batch_resources(wlmutils, batch) batch.set_account(wlmutils.get_test_account()) if wlmutils.get_test_launcher() == "cobalt": diff --git a/tests/full_wlm/test_generic_orc_launch_batch.py b/tests/full_wlm/test_generic_orc_launch_batch.py index 7e5591a30..f1f5952b3 100644 --- a/tests/full_wlm/test_generic_orc_launch_batch.py +++ b/tests/full_wlm/test_generic_orc_launch_batch.py @@ -35,19 +35,26 @@ if pytest.test_launcher not in pytest.wlm_options: pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") +if (pytest.test_launcher == "pbs") and (not pytest.has_aprun): + pytestmark = pytest.mark.skip( + reason="Launching orchestrators in a batch job is not supported on PBS without ALPS" + ) + -def test_launch_orc_auto_batch(fileutils, wlmutils): +def test_launch_orc_auto_batch(test_dir, wlmutils): """test single node orchestrator""" launcher = wlmutils.get_test_launcher() exp_name = "test-launch-auto-orc-batch" - exp = Experiment(exp_name, launcher=launcher) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() orc = exp.create_database( - wlmutils.get_test_port(), batch=True, interface=network_interface, single_cmd=False + wlmutils.get_test_port(), + batch=True, + interface=network_interface, + single_cmd=False, ) orc.batch_settings.set_account(wlmutils.get_test_account()) @@ -55,7 +62,7 @@ def test_launch_orc_auto_batch(fileutils, wlmutils): orc.batch_settings.set_walltime("00:02:00") if 
wlmutils.get_test_launcher() == "cobalt": orc.batch_settings.set_queue("debug-flat-quad") - + orc.set_path(test_dir) exp.start(orc, block=True) @@ -71,19 +78,22 @@ def test_launch_orc_auto_batch(fileutils, wlmutils): assert all([stat == status.STATUS_CANCELLED for stat in statuses]) -def test_launch_cluster_orc_batch_single(fileutils, wlmutils): +def test_launch_cluster_orc_batch_single(test_dir, wlmutils): """test clustered 3-node orchestrator with single command""" # TODO detect number of nodes in allocation and skip if not sufficent launcher = wlmutils.get_test_launcher() exp_name = "test-launch-auto-cluster-orc-batch-single" - exp = Experiment(exp_name, launcher=launcher) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() orc = exp.create_database( - wlmutils.get_test_port(), db_nodes=3, batch=True, interface=network_interface, single_cmd=True + wlmutils.get_test_port(), + db_nodes=3, + batch=True, + interface=network_interface, + single_cmd=True, ) orc.batch_settings.set_account(wlmutils.get_test_account()) @@ -110,23 +120,26 @@ def test_launch_cluster_orc_batch_single(fileutils, wlmutils): assert all([stat == status.STATUS_CANCELLED for stat in statuses]) -def test_launch_cluster_orc_batch_multi(fileutils, wlmutils): +def test_launch_cluster_orc_batch_multi(test_dir, wlmutils): """test clustered 3-node orchestrator""" # TODO detect number of nodes in allocation and skip if not sufficent launcher = wlmutils.get_test_launcher() exp_name = "test-launch-auto-cluster-orc-batch-multi" - exp = Experiment(exp_name, launcher=launcher) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() orc = exp.create_database( - wlmutils.get_test_port(), db_nodes=3, batch=True, 
interface=network_interface, single_cmd=False + wlmutils.get_test_port(), + db_nodes=3, + batch=True, + interface=network_interface, + single_cmd=False, ) - + orc.batch_settings.set_account(wlmutils.get_test_account()) - + orc.batch_settings.set_walltime("00:03:00") if wlmutils.get_test_launcher() == "cobalt": # As Cobalt won't allow us to run two @@ -149,16 +162,17 @@ def test_launch_cluster_orc_batch_multi(fileutils, wlmutils): assert all([stat == status.STATUS_CANCELLED for stat in statuses]) -def test_launch_cluster_orc_reconnect(fileutils, wlmutils): +def test_launch_cluster_orc_reconnect(test_dir, wlmutils): """test reconnecting to clustered 3-node orchestrator""" launcher = wlmutils.get_test_launcher() exp_name = "test-launch-cluster-orc-batch-reconect" - exp = Experiment(exp_name, launcher=launcher) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() - orc = exp.create_database(wlmutils.get_test_port(), db_nodes=3, batch=True, interface=network_interface) + orc = exp.create_database( + wlmutils.get_test_port(), db_nodes=3, batch=True, interface=network_interface + ) orc.set_path(test_dir) orc.batch_settings.set_account(wlmutils.get_test_account()) diff --git a/tests/full_wlm/test_mpmd.py b/tests/full_wlm/test_mpmd.py index 19f4660c2..18e918cfd 100644 --- a/tests/full_wlm/test_mpmd.py +++ b/tests/full_wlm/test_mpmd.py @@ -36,7 +36,7 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -def test_mpmd(fileutils, wlmutils): +def test_mpmd(fileutils, test_dir, wlmutils): """Run an MPMD model twice and check that it always gets executed the same way. 
@@ -61,7 +61,7 @@ def test_mpmd(fileutils, wlmutils): "cobalt": ["mpirun"], } - exp = Experiment(exp_name, launcher=launcher) + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) def prune_commands(launcher): available_commands = [] @@ -77,7 +77,6 @@ def prune_commands(launcher): f"MPMD on {launcher} only supported for run commands {by_launcher[launcher]}" ) - test_dir = fileutils.make_test_dir() for run_command in run_commands: script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings( diff --git a/tests/on_wlm/test_base_settings_on_wlm.py b/tests/on_wlm/test_base_settings_on_wlm.py index 3aa77983f..d75cc635f 100644 --- a/tests/on_wlm/test_base_settings_on_wlm.py +++ b/tests/on_wlm/test_base_settings_on_wlm.py @@ -40,10 +40,9 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -def test_model_on_wlm(fileutils, wlmutils): +def test_model_on_wlm(fileutils, test_dir, wlmutils): exp_name = "test-base-settings-model-launch" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings1 = wlmutils.get_base_run_settings("python", f"{script} --time=5") @@ -58,10 +57,9 @@ def test_model_on_wlm(fileutils, wlmutils): assert all([stat == status.STATUS_COMPLETED for stat in statuses]) -def test_model_stop_on_wlm(fileutils, wlmutils): +def test_model_stop_on_wlm(fileutils, test_dir, wlmutils): exp_name = "test-base-settings-model-stop" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings1 = wlmutils.get_base_run_settings("python", f"{script} --time=5") diff --git a/tests/on_wlm/test_colocated_model.py 
b/tests/on_wlm/test_colocated_model.py index 6a3945115..fa05eb513 100644 --- a/tests/on_wlm/test_colocated_model.py +++ b/tests/on_wlm/test_colocated_model.py @@ -36,142 +36,158 @@ else: supported_dbs = ["uds", "tcp", "deprecated"] +# Set to true if DB logs should be generated for debugging +DEBUG_DB = False + # retrieved from pytest fixtures launcher = pytest.test_launcher if launcher not in pytest.wlm_options: pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") + @pytest.mark.parametrize("db_type", supported_dbs) -def test_launch_colocated_model_defaults(fileutils, coloutils, db_type): +def test_launch_colocated_model_defaults(fileutils, test_dir, coloutils, db_type): """Test the launch of a model with a colocated database and local launcher""" - db_args = { } + db_args = {"debug": DEBUG_DB} - exp = Experiment("colocated_model_defaults", launcher=launcher) + exp = Experiment("colocated_model_defaults", launcher=launcher, exp_path=test_dir) colo_model = coloutils.setup_test_colo( - fileutils, - db_type, - exp, - db_args, + fileutils, db_type, exp, "send_data_local_smartredis.py", db_args, on_wlm=True ) - + exp.generate(colo_model) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0" exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all( + stat == status.STATUS_COMPLETED for stat in statuses + ), f"Statuses: {statuses}" # test restarting the colocated model exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all( + stat == status.STATUS_COMPLETED for stat in statuses + ), f"Statuses: {statuses}" -@pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_disable_pinning(fileutils, coloutils, db_type): - exp = Experiment("colocated_model_pinning_auto_1cpu", launcher=launcher) +@pytest.mark.parametrize("db_type", 
supported_dbs) +def test_colocated_model_disable_pinning(fileutils, test_dir, coloutils, db_type): + exp = Experiment( + "colocated_model_pinning_auto_1cpu", launcher=launcher, exp_path=test_dir + ) db_args = { "db_cpus": 1, "custom_pinning": [], + "debug": DEBUG_DB, } # Check to make sure that the CPU mask was correctly generated colo_model = coloutils.setup_test_colo( - fileutils, - db_type, - exp, - db_args, + fileutils, db_type, exp, "send_data_local_smartredis.py", db_args, on_wlm=True ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] is None + exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all( + stat == status.STATUS_COMPLETED for stat in statuses + ), f"Statuses: {statuses}" -@pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_pinning_auto_2cpu(fileutils, coloutils, db_type): - exp = Experiment("colocated_model_pinning_auto_2cpu", launcher=launcher) +@pytest.mark.parametrize("db_type", supported_dbs) +def test_colocated_model_pinning_auto_2cpu(fileutils, test_dir, coloutils, db_type): + exp = Experiment( + "colocated_model_pinning_auto_2cpu", + launcher=launcher, + exp_path=test_dir, + ) - db_args = { - "db_cpus": 2, - } + db_args = {"db_cpus": 2, "debug": DEBUG_DB} # Check to make sure that the CPU mask was correctly generated colo_model = coloutils.setup_test_colo( - fileutils, - db_type, - exp, - db_args, + fileutils, db_type, exp, "send_data_local_smartredis.py", db_args, on_wlm=True ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0,1" + exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all( + stat == status.STATUS_COMPLETED for stat in statuses + ), f"Statuses: {statuses}" + @pytest.mark.parametrize("db_type", supported_dbs) 
-def test_colocated_model_pinning_range(fileutils, coloutils, db_type): +def test_colocated_model_pinning_range(fileutils, test_dir, coloutils, db_type): # Check to make sure that the CPU mask was correctly generated # Assume that there are at least 4 cpus on the node - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + exp = Experiment( + "colocated_model_pinning_manual", + launcher=launcher, + exp_path=test_dir, + ) - db_args = { - "db_cpus": 4, - "custom_pinning": range(4) - } + db_args = {"db_cpus": 4, "custom_pinning": range(4), "debug": DEBUG_DB} colo_model = coloutils.setup_test_colo( - fileutils, - db_type, - exp, - db_args, + fileutils, db_type, exp, "send_data_local_smartredis.py", db_args, on_wlm=True ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0,1,2,3" + exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all( + stat == status.STATUS_COMPLETED for stat in statuses + ), f"Statuses: {statuses}" + @pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_pinning_list(fileutils, coloutils, db_type): +def test_colocated_model_pinning_list(fileutils, test_dir, coloutils, db_type): # Check to make sure that the CPU mask was correctly generated # note we presume that this has more than 2 CPUs on the supercomputer node - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + exp = Experiment( + "colocated_model_pinning_manual", + launcher=launcher, + exp_path=test_dir, + ) - db_args = { - "db_cpus": 2, - "custom_pinning": [0,2] - } + db_args = {"db_cpus": 2, "custom_pinning": [0, 2]} colo_model = coloutils.setup_test_colo( - fileutils, - db_type, - exp, - db_args, + fileutils, db_type, exp, "send_data_local_smartredis.py", db_args, on_wlm=True ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0,2" + exp.generate(colo_model) 
exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all( + stat == status.STATUS_COMPLETED for stat in statuses + ), f"Statuses: {statuses}" + @pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_pinning_mixed(fileutils, coloutils, db_type): +def test_colocated_model_pinning_mixed(fileutils, test_dir, coloutils, db_type): # Check to make sure that the CPU mask was correctly generated # note we presume that this at least 4 CPUs on the supercomputer node - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + exp = Experiment( + "colocated_model_pinning_manual", + launcher=launcher, + exp_path=test_dir, + ) - db_args = { - "db_cpus": 2, - "custom_pinning": [range(2), 3] - } + db_args = {"db_cpus": 2, "custom_pinning": [range(2), 3]} colo_model = coloutils.setup_test_colo( - fileutils, - db_type, - exp, - db_args, + fileutils, db_type, exp, "send_data_local_smartredis.py", db_args, on_wlm=True ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0,1,3" + exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all( + stat == status.STATUS_COMPLETED for stat in statuses + ), f"Statuses: {statuses}" diff --git a/tests/on_wlm/test_containers_wlm.py b/tests/on_wlm/test_containers_wlm.py index 414303df4..198a92f43 100644 --- a/tests/on_wlm/test_containers_wlm.py +++ b/tests/on_wlm/test_containers_wlm.py @@ -40,7 +40,7 @@ @pytest.mark.skipif(not singularity_exists, reason="Test needs singularity to run") -def test_singularity_wlm_smartredis(fileutils, wlmutils): +def test_singularity_wlm_smartredis(fileutils, test_dir, wlmutils): """Run two processes, each process puts a tensor on the DB, then accesses the other process's tensor. Finally, the tensor is used to run a model. 
@@ -55,13 +55,14 @@ def test_singularity_wlm_smartredis(fileutils, wlmutils): f"Test only runs on systems with PBS or Slurm as WLM. Current launcher: {launcher}" ) - test_dir = fileutils.make_test_dir() exp = Experiment( "smartredis_ensemble_exchange", exp_path=test_dir, launcher=launcher ) # create and start a database - orc = exp.create_database(port=wlmutils.get_test_port(), interface=wlmutils.get_test_interface()) + orc = exp.create_database( + port=wlmutils.get_test_port(), interface=wlmutils.get_test_interface() + ) exp.generate(orc) exp.start(orc, block=False) diff --git a/tests/on_wlm/test_generic_orc_launch.py b/tests/on_wlm/test_generic_orc_launch.py index 919317c73..ab100d1a7 100644 --- a/tests/on_wlm/test_generic_orc_launch.py +++ b/tests/on_wlm/test_generic_orc_launch.py @@ -33,13 +33,12 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -def test_launch_orc_auto(fileutils, wlmutils): +def test_launch_orc_auto(test_dir, wlmutils): """test single node orchestrator""" launcher = wlmutils.get_test_launcher() exp_name = "test-launch-auto-orc" - exp = Experiment(exp_name, launcher=launcher) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() @@ -65,14 +64,13 @@ def test_launch_orc_auto(fileutils, wlmutils): assert all([stat == status.STATUS_CANCELLED for stat in statuses]) -def test_launch_cluster_orc_single(fileutils, wlmutils): +def test_launch_cluster_orc_single(test_dir, wlmutils): """test clustered 3-node orchestrator with single command""" # TODO detect number of nodes in allocation and skip if not sufficent launcher = wlmutils.get_test_launcher() exp_name = "test-launch-auto-cluster-orc-single" - exp = Experiment(exp_name, launcher=launcher) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on 
existing allocation network_interface = wlmutils.get_test_interface() @@ -99,14 +97,13 @@ def test_launch_cluster_orc_single(fileutils, wlmutils): assert all([stat == status.STATUS_CANCELLED for stat in statuses]) -def test_launch_cluster_orc_multi(fileutils, wlmutils): +def test_launch_cluster_orc_multi(test_dir, wlmutils): """test clustered 3-node orchestrator with multiple commands""" # TODO detect number of nodes in allocation and skip if not sufficent launcher = wlmutils.get_test_launcher() exp_name = "test-launch-auto-cluster-orc-multi" - exp = Experiment(exp_name, launcher=launcher) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() diff --git a/tests/on_wlm/test_het_job.py b/tests/on_wlm/test_het_job.py index 63d78fb23..e8f20d1ee 100644 --- a/tests/on_wlm/test_het_job.py +++ b/tests/on_wlm/test_het_job.py @@ -29,11 +29,11 @@ from smartsim import Experiment from smartsim.settings import SrunSettings - # retrieved from pytest fixtures if pytest.test_launcher != "slurm": pytestmark = pytest.mark.skip(reason="Test is only for Slurm WLM systems") + def test_mpmd_errors(monkeypatch): monkeypatch.setenv("SLURM_HET_SIZE", "1") exp_name = "test-het-job-errors" @@ -50,15 +50,14 @@ def test_mpmd_errors(monkeypatch): def test_set_het_groups(monkeypatch): - """Test ability to set one or more het groups to run setting - """ + """Test ability to set one or more het groups to run setting""" monkeypatch.setenv("SLURM_HET_SIZE", "4") exp_name = "test-set-het-group" exp = Experiment(exp_name, launcher="slurm") rs: SrunSettings = exp.create_run_settings("sleep", "1", run_command="srun") rs.set_het_group([1]) assert rs.run_args["het-group"] == "1" - rs.set_het_group([3,2]) + rs.set_het_group([3, 2]) assert rs.run_args["het-group"] == "3,2" with pytest.raises(ValueError): rs.set_het_group([4]) diff --git 
a/tests/on_wlm/test_launch_errors.py b/tests/on_wlm/test_launch_errors.py index 77ba8a69a..7281cb568 100644 --- a/tests/on_wlm/test_launch_errors.py +++ b/tests/on_wlm/test_launch_errors.py @@ -36,12 +36,11 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -def test_failed_status(fileutils, wlmutils): +def test_failed_status(fileutils, test_dir, wlmutils): """Test when a failure occurs deep into model execution""" exp_name = "test-report-failure" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) script = fileutils.get_test_conf_path("bad.py") settings = exp.create_run_settings( @@ -58,7 +57,7 @@ def test_failed_status(fileutils, wlmutils): assert stat[0] == status.STATUS_FAILED -def test_bad_run_command_args(fileutils, wlmutils): +def test_bad_run_command_args(fileutils, test_dir, wlmutils): """Should fail because of incorrect arguments given to the run command @@ -69,8 +68,7 @@ def test_bad_run_command_args(fileutils, wlmutils): pytest.skip(f"Only fails with slurm. 
Launcher is {launcher}") exp_name = "test-bad-run-command-args" - exp = Experiment(exp_name, launcher=launcher) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) script = fileutils.get_test_conf_path("bad.py") diff --git a/tests/on_wlm/test_launch_ompi_lsf.py b/tests/on_wlm/test_launch_ompi_lsf.py index e3327514a..ed082d22e 100644 --- a/tests/on_wlm/test_launch_ompi_lsf.py +++ b/tests/on_wlm/test_launch_ompi_lsf.py @@ -34,13 +34,12 @@ @pytest.mark.skip("OpenMPI currently not working on LSF systems") -def test_launch_openmpi_lsf(wlmutils, fileutils): +def test_launch_openmpi_lsf(fileutils, test_dir, wlmutils): launcher = wlmutils.get_test_launcher() if launcher != "lsf": pytest.skip("Test only runs on systems with LSF as WLM") exp_name = "test-launch-openmpi-lsf" - exp = Experiment(exp_name, launcher=launcher) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", script, "mpirun") diff --git a/tests/on_wlm/test_local_step.py b/tests/on_wlm/test_local_step.py index 0613f41c2..02e619ebf 100644 --- a/tests/on_wlm/test_local_step.py +++ b/tests/on_wlm/test_local_step.py @@ -26,6 +26,7 @@ import os import uuid + import pytest from smartsim import Experiment, status @@ -40,13 +41,12 @@ """ -def test_local_env_pass_implicit(fileutils) -> None: +def test_local_env_pass_implicit(fileutils, test_dir) -> None: """Ensure implicitly exported env is available to running task""" exp_value = str(uuid.uuid4()) env_key = "test_local_env_pass_implicit" os.environ[env_key] = exp_value - test_dir = fileutils.make_test_dir() exp_dir = f"{test_dir}/exp" os.makedirs(exp_dir) script = fileutils.get_test_conf_path("check_env.py") @@ -72,19 +72,18 @@ def test_local_env_pass_implicit(fileutils) -> None: with open(f"{exp_dir}/{app_name}/{app_name}.out") as app_outfile: app_output = 
app_outfile.read() - + # verify application was able to access the env var assert f"{env_key}=={exp_value}" in app_output -def test_local_env_pass_explicit(fileutils) -> None: +def test_local_env_pass_explicit(fileutils, test_dir) -> None: """Ensure explicitly exported env is available to running task""" exp_value = str(uuid.uuid4()) env_key = "test_local_env_pass_explicit" assert env_key not in os.environ - test_dir = fileutils.make_test_dir() script = fileutils.get_test_conf_path("check_env.py") exp_dir = f"{test_dir}/exp" @@ -111,6 +110,6 @@ def test_local_env_pass_explicit(fileutils) -> None: with open(f"{exp_dir}/{app_name}/{app_name}.out") as app_outfile: app_output = app_outfile.read() - + # verify application was able to access the env var assert f"{env_key}=={exp_value}" in app_output diff --git a/tests/on_wlm/test_restart.py b/tests/on_wlm/test_restart.py index 86d883358..b1997961d 100644 --- a/tests/on_wlm/test_restart.py +++ b/tests/on_wlm/test_restart.py @@ -35,11 +35,9 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -def test_restart(fileutils, wlmutils): - +def test_restart(fileutils, test_dir, wlmutils): exp_name = "test-restart" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=5") diff --git a/tests/on_wlm/test_simple_base_settings_on_wlm.py b/tests/on_wlm/test_simple_base_settings_on_wlm.py index d46a46aae..08bf875e2 100644 --- a/tests/on_wlm/test_simple_base_settings_on_wlm.py +++ b/tests/on_wlm/test_simple_base_settings_on_wlm.py @@ -48,7 +48,7 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -def test_simple_model_on_wlm(fileutils, wlmutils): +def test_simple_model_on_wlm(fileutils, test_dir, wlmutils): launcher = wlmutils.get_test_launcher() if 
launcher not in ["pbs", "slurm", "cobalt", "lsf"]: pytest.skip( @@ -56,8 +56,7 @@ def test_simple_model_on_wlm(fileutils, wlmutils): ) exp_name = "test-simplebase-settings-model-launch" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = RunSettings("python", exe_args=f"{script} --time=5") @@ -69,7 +68,7 @@ def test_simple_model_on_wlm(fileutils, wlmutils): assert exp.get_status(M)[0] == status.STATUS_COMPLETED -def test_simple_model_stop_on_wlm(fileutils, wlmutils): +def test_simple_model_stop_on_wlm(fileutils, test_dir, wlmutils): launcher = wlmutils.get_test_launcher() if launcher not in ["pbs", "slurm", "cobalt", "lsf"]: pytest.skip( @@ -77,8 +76,7 @@ def test_simple_model_stop_on_wlm(fileutils, wlmutils): ) exp_name = "test-simplebase-settings-model-stop" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = RunSettings("python", exe_args=f"{script} --time=5") diff --git a/tests/on_wlm/test_simple_entity_launch.py b/tests/on_wlm/test_simple_entity_launch.py index 14fd2e4ae..a05d7be0f 100644 --- a/tests/on_wlm/test_simple_entity_launch.py +++ b/tests/on_wlm/test_simple_entity_launch.py @@ -46,10 +46,9 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -def test_models(fileutils, wlmutils): +def test_models(fileutils, test_dir, wlmutils): exp_name = "test-models-launch" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = 
exp.create_run_settings("python", f"{script} --time=5") @@ -63,10 +62,9 @@ def test_models(fileutils, wlmutils): assert all([stat == status.STATUS_COMPLETED for stat in statuses]) -def test_ensemble(fileutils, wlmutils): +def test_ensemble(fileutils, test_dir, wlmutils): exp_name = "test-ensemble-launch" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=5") @@ -80,12 +78,11 @@ def test_ensemble(fileutils, wlmutils): assert all([stat == status.STATUS_COMPLETED for stat in statuses]) -def test_summary(fileutils, wlmutils): +def test_summary(fileutils, test_dir, wlmutils): """Fairly rudimentary test of the summary dataframe""" exp_name = "test-launch-summary" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) sleep = fileutils.get_test_conf_path("sleep.py") bad = fileutils.get_test_conf_path("bad.py") @@ -103,7 +100,7 @@ def test_summary(fileutils, wlmutils): assert exp.get_status(bad)[0] == status.STATUS_FAILED assert exp.get_status(sleep)[0] == status.STATUS_COMPLETED - summary_str = exp.summary(format="plain") + summary_str = exp.summary(style="plain") print(summary_str) rows = [s.split() for s in summary_str.split("\n")] diff --git a/tests/on_wlm/test_slurm_commands.py b/tests/on_wlm/test_slurm_commands.py new file mode 100644 index 000000000..d3ebbcd31 --- /dev/null +++ b/tests/on_wlm/test_slurm_commands.py @@ -0,0 +1,55 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import pytest + +from smartsim._core.launcher.slurm.slurmCommands import * +from smartsim.error.errors import LauncherError + +# retrieved from pytest fixtures +if pytest.test_launcher != "slurm": + pytestmark = pytest.mark.skip(reason="Test is only for Slurm WLM systems") + + +# Test that common ways of launching commands +# raise when expected to do so +@pytest.mark.parametrize( + "cmd,raises", + [ + (sacct, True), + (sstat, True), + (sinfo, False), + (salloc, False), + (scancel, False), + (scontrol, False), + ], +) +def test_error_raises(cmd, raises): + args = ["--non_existing_arg"] + if raises: + with pytest.raises(LauncherError): + cmd(args, raise_on_err=True) + else: + cmd(args) diff --git a/tests/on_wlm/test_stop.py b/tests/on_wlm/test_stop.py index a786ce1a4..383c6c4bd 100644 --- a/tests/on_wlm/test_stop.py +++ b/tests/on_wlm/test_stop.py @@ -42,10 +42,9 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -def test_stop_entity(fileutils, wlmutils): +def test_stop_entity(fileutils, test_dir, wlmutils): exp_name = "test-launch-stop-model" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=10") @@ -59,11 +58,9 @@ def test_stop_entity(fileutils, wlmutils): assert exp.get_status(M1)[0] == status.STATUS_CANCELLED -def test_stop_entity_list(fileutils, wlmutils): - +def test_stop_entity_list(fileutils, test_dir, wlmutils): exp_name = "test-launch-stop-ensemble" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=10") diff --git 
a/tests/test_alps_settings.py b/tests/test_alps_settings.py index 2d7577e9b..012f27fce 100644 --- a/tests/test_alps_settings.py +++ b/tests/test_alps_settings.py @@ -30,6 +30,9 @@ from smartsim.error import SSUnsupportedError from smartsim.settings import AprunSettings +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + def test_aprun_settings(): settings = AprunSettings("python") diff --git a/tests/test_batch_settings.py b/tests/test_batch_settings.py index abf685146..cb2096727 100644 --- a/tests/test_batch_settings.py +++ b/tests/test_batch_settings.py @@ -30,6 +30,9 @@ from smartsim.settings import BsubBatchSettings, QsubBatchSettings, SbatchSettings from smartsim.settings.settings import create_batch_settings +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + def test_create_pbs_batch(): pbs_batch = create_batch_settings( @@ -38,8 +41,7 @@ def test_create_pbs_batch(): args = pbs_batch.format_batch_args() assert isinstance(pbs_batch, QsubBatchSettings) assert args == [ - "-l select=1:ncpus=10", - "-l place=scatter", + "-l nodes=1:ncpus=10", "-l walltime=10:00:00", "-q default", "-A myproject", @@ -102,7 +104,7 @@ def test_existing_batch_args_mutation(): queue="default", batch_args=batch_args, ) - + # verify initial expectations assert "k1" in bsub.batch_args assert "k2" in bsub.batch_args @@ -114,6 +116,7 @@ def test_existing_batch_args_mutation(): assert bsub.batch_args["k1"] == orig_bargs["k1"] assert bsub.batch_args["k1"] != batch_args["k1"] + def test_direct_set_batch_args_mutation(): """ Ensure that if the batch_args dict is set directly, any previously @@ -129,7 +132,7 @@ def test_direct_set_batch_args_mutation(): queue="default", ) bsub.batch_args = batch_args - + # verify initial expectations assert "k1" in bsub.batch_args assert "k2" in bsub.batch_args diff --git a/tests/test_buildenv.py b/tests/test_buildenv.py index 91ae4d221..d362ca1dd 100644 --- a/tests/test_buildenv.py +++ 
b/tests/test_buildenv.py @@ -26,9 +26,12 @@ import pytest +from pkg_resources import packaging # type: ignore from smartsim._core._install.buildenv import Version_ -from pkg_resources import packaging # type: ignore + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a def test_version_hash_eq(): diff --git a/tests/test_cli.py b/tests/test_cli.py index ee07bc853..899caa1e0 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -25,13 +25,16 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import argparse -from contextlib import contextmanager +import logging +import os +import pathlib import typing as t +from contextlib import contextmanager import pytest import smartsim -from smartsim._core._cli import build, cli +from smartsim._core._cli import build, cli, plugin from smartsim._core._cli.build import configure_parser as build_parser from smartsim._core._cli.build import execute as build_execute from smartsim._core._cli.clean import configure_parser as clean_parser @@ -40,10 +43,21 @@ from smartsim._core._cli.dbcli import execute as dbcli_execute from smartsim._core._cli.site import execute as site_execute from smartsim._core._cli.utils import MenuItemConfig -from smartsim._core._cli.validate import ( - execute as validate_execute, - configure_parser as validate_parser, -) +from smartsim._core._cli.validate import configure_parser as validate_parser +from smartsim._core._cli.validate import execute as validate_execute + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +_TEST_LOGGER = logging.getLogger(__name__) + +try: + import smartdashboard +except: + test_dash_plugin = False +else: + test_dash_plugin = True + def mock_execute_custom(msg: str = None, good: bool = True) -> int: retval = 0 if good else 1 @@ -51,28 +65,32 @@ def mock_execute_custom(msg: str = None, good: bool = True) -> int: return retval -def mock_execute_good(_ns: argparse.Namespace) -> int: - 
return mock_execute_custom("GOOD THINGS", good = True) +def mock_execute_good( + _ns: argparse.Namespace, _unparsed: t.Optional[t.List[str]] = None +) -> int: + return mock_execute_custom("GOOD THINGS", good=True) -def mock_execute_fail(_ns: argparse.Namespace) -> int: - return mock_execute_custom("BAD THINGS", good = False) +def mock_execute_fail( + _ns: argparse.Namespace, _unparsed: t.Optional[t.List[str]] = None +) -> int: + return mock_execute_custom("BAD THINGS", good=False) def test_cli_default_args_parsing(capsys): """Test default parser behaviors with no subparsers""" menu: t.List[cli.MenuItemConfig] = [] - smart_cli = cli.SmartCli(menu) - + smart_cli = cli.SmartCli(menu) + captured = capsys.readouterr() # throw away existing output - with pytest.raises(SystemExit) as e: + with pytest.raises(SystemExit) as e: # the parser shouldn't get the `smart` CLI argument build_args = ["smart", "-h"] smart_cli.parser.parse_args(build_args) captured = capsys.readouterr() - assert "invalid choice: \'smart\'" in captured.err + assert "invalid choice: 'smart'" in captured.err assert e.value.code == 2 @@ -81,19 +99,18 @@ def test_cli_invalid_command(capsys): exp_help = "this is my mock help text for build" exp_cmd = "build" actual_cmd = f"not{exp_cmd}" - menu = [cli.MenuItemConfig(exp_cmd, - exp_help, - mock_execute_good, - build.configure_parser)] - smart_cli = cli.SmartCli(menu) - + menu = [ + cli.MenuItemConfig(exp_cmd, exp_help, mock_execute_good, build.configure_parser) + ] + smart_cli = cli.SmartCli(menu) + captured = capsys.readouterr() # throw away existing output with pytest.raises(SystemExit) as e: build_args = [actual_cmd, "-h"] smart_cli.parser.parse_args(build_args) - - captured = capsys.readouterr() # capture new output - + + captured = capsys.readouterr() # capture new output + # show that the command was not recognized assert "invalid choice" in captured.err assert e.value.code == 2 @@ -102,15 +119,15 @@ def test_cli_invalid_command(capsys): def 
test_cli_bad_default_args_parsing_bad_help(capsys): """Test passing an argument name that is incorrect""" menu: t.List[cli.MenuItemConfig] = [] - smart_cli = cli.SmartCli(menu) - + smart_cli = cli.SmartCli(menu) + captured = capsys.readouterr() # throw away existing output with pytest.raises(SystemExit) as e: build_args = ["--halp"] # <-- HELP vs HALP smart_cli.parser.parse_args(build_args) - - captured = capsys.readouterr() # capture new output - + + captured = capsys.readouterr() # capture new output + assert "smart: error:" in captured.err assert e.value.code == 2 @@ -118,39 +135,38 @@ def test_cli_bad_default_args_parsing_bad_help(capsys): def test_cli_bad_default_args_parsing_good_help(capsys): """Test passing an argument name that is correct""" menu: t.List[cli.MenuItemConfig] = [] - smart_cli = cli.SmartCli(menu) - + smart_cli = cli.SmartCli(menu) + captured = capsys.readouterr() # throw away existing output with pytest.raises(SystemExit) as e: build_args = ["-h"] smart_cli.parser.parse_args(build_args) - - captured = capsys.readouterr() # capture new output - + + captured = capsys.readouterr() # capture new output + assert "smart: error:" not in captured.out assert "usage: smart" in captured.out assert e.value.code == 0 def test_cli_add_subparser(capsys): - """Test that passing configuration for a command causes the command + """Test that passing configuration for a command causes the command to be added to the CLI """ exp_help = "this is my mock help text for build" exp_cmd = "build" - menu = [cli.MenuItemConfig(exp_cmd, - exp_help, - mock_execute_good, - build.configure_parser)] - smart_cli = cli.SmartCli(menu) - + menu = [ + cli.MenuItemConfig(exp_cmd, exp_help, mock_execute_good, build.configure_parser) + ] + smart_cli = cli.SmartCli(menu) + captured = capsys.readouterr() # throw away existing output with pytest.raises(SystemExit) as e: - build_args = [exp_cmd, "-h"] # <--- -h only + build_args = [exp_cmd, "-h"] # <--- -h only 
smart_cli.parser.parse_args(build_args) - - captured = capsys.readouterr() # capture new output - + + captured = capsys.readouterr() # capture new output + # show that -h showed the expected help text assert exp_help in captured.out assert e.value.code == 0 @@ -159,9 +175,9 @@ def test_cli_add_subparser(capsys): with pytest.raises(SystemExit) as e: build_args = [exp_cmd, "--help"] smart_cli.parser.parse_args(build_args) - - captured = capsys.readouterr() # capture new output - + + captured = capsys.readouterr() # capture new output + # show that --help ALSO works assert exp_help in captured.out assert e.value.code == 0 @@ -175,23 +191,23 @@ def test_cli_subparser_selection(capsys): exp_b_help = "this is my mock help text for build" exp_b_cmd = "build" - menu = [cli.MenuItemConfig(exp_a_cmd, - exp_a_help, - mock_execute_good, - build.configure_parser), - cli.MenuItemConfig(exp_b_cmd, - exp_b_help, - mock_execute_good, - build.configure_parser)] - smart_cli = cli.SmartCli(menu) - + menu = [ + cli.MenuItemConfig( + exp_a_cmd, exp_a_help, mock_execute_good, build.configure_parser + ), + cli.MenuItemConfig( + exp_b_cmd, exp_b_help, mock_execute_good, build.configure_parser + ), + ] + smart_cli = cli.SmartCli(menu) + captured = capsys.readouterr() # throw away existing output with pytest.raises(SystemExit) as e: - build_args = [exp_a_cmd, "-h"] # <--- -h only + build_args = [exp_a_cmd, "-h"] # <--- -h only smart_cli.parser.parse_args(build_args) - - captured = capsys.readouterr() # capture new output - + + captured = capsys.readouterr() # capture new output + # show that -h showed the expected help text for `smart dbcli -h` assert exp_a_help in captured.out assert e.value.code == 0 @@ -200,9 +216,9 @@ def test_cli_subparser_selection(capsys): with pytest.raises(SystemExit) as e: build_args = [exp_b_cmd, "--help"] smart_cli.parser.parse_args(build_args) - - captured = capsys.readouterr() # capture new output - + + captured = capsys.readouterr() # capture new output + # 
show that -h showed the expected help text for `smart build -h` assert exp_b_help in captured.out assert e.value.code == 0 @@ -215,27 +231,23 @@ def test_cli_command_execution(capsys): exp_b_help = "this is my mock help text for build" exp_b_cmd = "build" - - dbcli_exec = lambda x: mock_execute_custom(msg="Database", good=True) - build_exec = lambda x: mock_execute_custom(msg="Builder", good=True) - - menu = [cli.MenuItemConfig(exp_a_cmd, - exp_a_help, - dbcli_exec, - lambda x: None), - cli.MenuItemConfig(exp_b_cmd, - exp_b_help, - build_exec, - lambda x: None)] - smart_cli = cli.SmartCli(menu) - + + dbcli_exec = lambda x, y: mock_execute_custom(msg="Database", good=True) + build_exec = lambda x, y: mock_execute_custom(msg="Builder", good=True) + + menu = [ + cli.MenuItemConfig(exp_a_cmd, exp_a_help, dbcli_exec, lambda x: None), + cli.MenuItemConfig(exp_b_cmd, exp_b_help, build_exec, lambda x: None), + ] + smart_cli = cli.SmartCli(menu) + captured = capsys.readouterr() # throw away existing output - + build_args = ["smart", exp_a_cmd] ret_val = smart_cli.execute(build_args) - captured = capsys.readouterr() # capture new output - + captured = capsys.readouterr() # capture new output + # show that `smart dbcli` calls the build parser and build execute function assert "Database" in captured.out assert ret_val == 0 @@ -243,8 +255,8 @@ def test_cli_command_execution(capsys): build_args = ["smart", exp_b_cmd] ret_val = smart_cli.execute(build_args) - captured = capsys.readouterr() # capture new output - + captured = capsys.readouterr() # capture new output + # show that `smart build` calls the build parser and build execute function assert "Builder" in captured.out assert ret_val == 0 @@ -253,83 +265,139 @@ def test_cli_command_execution(capsys): def test_cli_default_cli(capsys): """Ensure the default CLI supports expected top-level commands""" smart_cli = cli.default_cli() - + captured = capsys.readouterr() # throw away existing output - + # execute with no argument, 
expect full help text build_args = ["smart"] ret_val = smart_cli.execute(build_args) - captured = capsys.readouterr() # capture new output - + captured = capsys.readouterr() # capture new output + # show that `smart dbcli` calls the build parser and build execute function assert "usage: smart [-h] " in captured.out assert "Available commands" in captured.out - assert ret_val == 0 + assert ret_val == os.EX_USAGE # execute with `build` argument, expect build-specific help text with pytest.raises(SystemExit) as e: build_args = ["smart", "build", "-h"] ret_val = smart_cli.execute(build_args) - captured = capsys.readouterr() # capture new output - + captured = capsys.readouterr() # capture new output + assert "usage: smart build [-h]" in captured.out assert "Build SmartSim dependencies" in captured.out assert "optional arguments:" in captured.out or "options:" in captured.out - assert ret_val == 0 + assert ret_val == os.EX_USAGE # execute with `clean` argument, expect clean-specific help text with pytest.raises(SystemExit) as e: build_args = ["smart", "clean", "-h"] ret_val = smart_cli.execute(build_args) - captured = capsys.readouterr() # capture new output - + captured = capsys.readouterr() # capture new output + assert "usage: smart clean [-h]" in captured.out assert "Remove previous ML runtime installation" in captured.out assert "optional arguments:" in captured.out or "options:" in captured.out assert "--clobber" in captured.out - assert ret_val == 0 + assert ret_val == os.EX_USAGE # execute with `dbcli` argument, expect dbcli-specific help text with pytest.raises(SystemExit) as e: build_args = ["smart", "dbcli", "-h"] ret_val = smart_cli.execute(build_args) - captured = capsys.readouterr() # capture new output - + captured = capsys.readouterr() # capture new output + assert "usage: smart dbcli [-h]" in captured.out assert "Print the path to the redis-cli binary" in captured.out assert "optional arguments:" in captured.out or "options:" in captured.out - assert 
ret_val == 0 + assert ret_val == os.EX_USAGE # execute with `site` argument, expect site-specific help text with pytest.raises(SystemExit) as e: build_args = ["smart", "site", "-h"] ret_val = smart_cli.execute(build_args) - captured = capsys.readouterr() # capture new output - + captured = capsys.readouterr() # capture new output + assert "usage: smart site [-h]" in captured.out assert "Print the installation site of SmartSim" in captured.out assert "optional arguments:" in captured.out or "options:" in captured.out - assert ret_val == 0 + assert ret_val == os.EX_USAGE # execute with `clobber` argument, expect clobber-specific help text with pytest.raises(SystemExit) as e: build_args = ["smart", "clobber", "-h"] ret_val = smart_cli.execute(build_args) - captured = capsys.readouterr() # capture new output - + captured = capsys.readouterr() # capture new output + assert "usage: smart clobber [-h]" in captured.out assert "Remove all previous dependency installations" in captured.out assert "optional arguments:" in captured.out or "options:" in captured.out # assert "--clobber" not in captured.out - assert ret_val == 0 + assert ret_val == os.EX_USAGE + + +@pytest.mark.skipif(not test_dash_plugin, reason="plugin not found") +def test_cli_plugin_dashboard(capfd): + """Ensure expected dashboard CLI plugin commands are supported""" + smart_cli = cli.default_cli() + capfd.readouterr() # throw away existing output + + # execute with `dashboard` argument, expect dashboard-specific help text + build_args = ["smart", "dashboard", "-h"] + rc = smart_cli.execute(build_args) + captured = capfd.readouterr() # capture new output + assert "[-d DIRECTORY]" in captured.out + assert "[-p PORT]" in captured.out + + assert "optional arguments:" in captured.out + assert rc == 0 + + +def test_cli_plugin_invalid( + monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture +): + """Ensure unexpected CLI plugins are reported""" + import smartsim._core._cli.cli + import 
smartsim._core._cli.plugin + + plugin_module = "notinstalled.Experiment_Overview" + bad_plugins = [ + lambda: MenuItemConfig( + "dashboard", + "Start the SmartSim dashboard", + plugin.dynamic_execute(plugin_module, "Dashboard!"), + is_plugin=True, + ) + ] + monkeypatch.setattr(smartsim._core._cli.cli, "plugins", bad_plugins) + # Coloredlogs doesn't play nice with caplog + monkeypatch.setattr( + smartsim._core._cli.plugin, + "_LOGGER", + _TEST_LOGGER, + ) + + smart_cli = cli.default_cli() + + # execute with `dashboard` argument, expect failure to find dashboard plugin + build_args = ["smart", "dashboard", "-h"] + + rc = smart_cli.execute(build_args) + + assert plugin_module in caplog.text + assert "not found" in caplog.text + assert rc == os.EX_CONFIG + + +# fmt: off @pytest.mark.parametrize( "command,mock_location,exp_output", [ @@ -342,28 +410,31 @@ def test_cli_default_cli(capsys): pytest.param("info", "info_execute", "mocked-validate", id="ensure info action is executed"), ] ) +# fmt: on def test_cli_action(capsys, monkeypatch, command, mock_location, exp_output): """Ensure the default CLI executes the build action""" - def mock_execute(ns: argparse.Namespace): + + def mock_execute(ns: argparse.Namespace, _unparsed: t.Optional[t.List[str]] = None): print(exp_output) return 0 monkeypatch.setattr(smartsim._core._cli.cli, mock_location, mock_execute) - + smart_cli = cli.default_cli() - + captured = capsys.readouterr() # throw away existing output - + # execute with `` argument, expect -specific output text build_args = ["smart", command] ret_val = smart_cli.execute(build_args) - captured = capsys.readouterr() # capture new output - + captured = capsys.readouterr() # capture new output + assert exp_output in captured.out assert ret_val == 0 +# fmt: off @pytest.mark.parametrize( "command,mock_location,exp_output,optional_arg,exp_valid,exp_err_msg,check_prop,exp_prop_val", [ @@ -385,47 +456,49 @@ def mock_execute(ns: argparse.Namespace): pytest.param("validate", 
"validate_execute", "gpuX mocked-validate", "--device=gpux", False, "invalid choice: 'gpux'", "", "", id="validate: set bad device 'gpuX'"), ] ) -def test_cli_optional_args(capsys, - monkeypatch, - command: str, - mock_location: str, - exp_output: str, - optional_arg: str, - exp_valid: bool, - exp_err_msg: str, - check_prop: str, - exp_prop_val: t.Any): +# fmt: on +def test_cli_optional_args( + capsys, + monkeypatch, + command: str, + mock_location: str, + exp_output: str, + optional_arg: str, + exp_valid: bool, + exp_err_msg: str, + check_prop: str, + exp_prop_val: t.Any, +): """Ensure the parser for a command handles expected optional arguments""" - def mock_execute(ns: argparse.Namespace): + + def mock_execute(ns: argparse.Namespace, _unparsed: t.Optional[t.List[str]] = None): print(exp_output) return 0 monkeypatch.setattr(smartsim._core._cli.cli, mock_location, mock_execute) - + smart_cli = cli.default_cli() - + captured = capsys.readouterr() # throw away existing output - + build_args = ["smart", command] + optional_arg.split() if exp_valid: ret_val = smart_cli.execute(build_args) - captured = capsys.readouterr() # capture new output + captured = capsys.readouterr() # capture new output assert exp_output in captured.out # did the expected execution method occur? assert ret_val == 0 # is the retval is non-failure code? - - # is the value from the optional argument set in the parsed args? 
- assert smart_cli.args.__dict__[check_prop] == exp_prop_val else: with pytest.raises(SystemExit) as e: ret_val = smart_cli.execute(build_args) assert ret_val > 0 - captured = capsys.readouterr() # capture new output + captured = capsys.readouterr() # capture new output assert exp_err_msg in captured.err +# fmt: off @pytest.mark.parametrize( "command,mock_location,mock_output,exp_output", [ @@ -438,33 +511,38 @@ def mock_execute(ns: argparse.Namespace): pytest.param("info", "info_execute", "helpful mocked-validate", "usage: smart info", id="info"), ] ) -def test_cli_help_support(capsys, - monkeypatch, - command: str, - mock_location: str, - mock_output: str, - exp_output: str): +# fmt: on +def test_cli_help_support( + capsys, + monkeypatch, + command: str, + mock_location: str, + mock_output: str, + exp_output: str, +): """Ensure the parser supports help optional for commands as expected""" - def mock_execute(ns: argparse.Namespace): + + def mock_execute(ns: argparse.Namespace, unparsed: t.Optional[t.List[str]] = None): print(mock_output) return 0 monkeypatch.setattr(smartsim._core._cli.cli, mock_location, mock_execute) - + smart_cli = cli.default_cli() - + captured = capsys.readouterr() # throw away existing output - + # execute with `` argument, expect -specific help text build_args = ["smart", command] + ["-h"] with pytest.raises(SystemExit) as e: ret_val = smart_cli.execute(build_args) assert ret_val == 0 - captured = capsys.readouterr() # capture new output + captured = capsys.readouterr() # capture new output assert exp_output in captured.out +# fmt: off @pytest.mark.parametrize( "command,mock_location,exp_output", [ @@ -477,29 +555,29 @@ def mock_execute(ns: argparse.Namespace): pytest.param("info", "info_execute", "verbose mocked-validate", id="validate"), ] ) -def test_cli_invalid_optional_args(capsys, - monkeypatch, - command: str, - mock_location: str, - exp_output: str): +# fmt: on +def test_cli_invalid_optional_args( + capsys, monkeypatch, command: 
str, mock_location: str, exp_output: str +): """Ensure the parser throws expected error for an invalid argument""" - def mock_execute(ns: argparse.Namespace): + + def mock_execute(ns: argparse.Namespace, unparsed: t.Optional[t.List[str]] = None): print(exp_output) return 0 monkeypatch.setattr(smartsim._core._cli.cli, mock_location, mock_execute) - + smart_cli = cli.default_cli() - + captured = capsys.readouterr() # throw away existing output - + # execute with `` argument, expect CLI to raise invalid arg error build_args = ["smart", command] + ["-xyz"] with pytest.raises(SystemExit) as e: ret_val = smart_cli.execute(build_args) assert ret_val > 0 - captured = capsys.readouterr() # capture new output + captured = capsys.readouterr() # capture new output assert "unrecognized argument" in captured.err @@ -513,57 +591,56 @@ def mock_execute(ns: argparse.Namespace): pytest.param("site", id="site"), pytest.param("validate", id="validate"), pytest.param("info", id="info"), - ] + ], ) def test_cli_invalid_optional_args(capsys, command): """Ensure the parser throws expected error for an invalid command""" smart_cli = cli.default_cli() - + captured = capsys.readouterr() # throw away existing output - + # execute with `` argument, expect CLI to raise invalid arg error build_args = ["smart", command] + ["-xyz"] with pytest.raises(SystemExit) as e: ret_val = smart_cli.execute(build_args) assert ret_val > 0 - captured = capsys.readouterr() # capture new output + captured = capsys.readouterr() # capture new output assert "unrecognized argument" in captured.err def test_cli_full_clean_execute(capsys, monkeypatch): - """Ensure that the execute method of clean is called """ + """Ensure that the execute method of clean is called""" exp_retval = 0 exp_output = "mocked-clean utility" - def mock_operation(*args, **kwargs) -> int: + # mock out the internal clean method so we don't actually delete anything + def mock_clean(core_path: pathlib.Path, _all: bool = False) -> int: 
print(exp_output) return exp_retval - # mock out the internal clean method so we don't actually delete anything - monkeypatch.setattr(smartsim._core._cli.clean, "clean", mock_operation) + monkeypatch.setattr(smartsim._core._cli.clean, "clean", mock_clean) command = "clean" - cfg = MenuItemConfig(command, - f"test {command} help text", - clean_execute, - clean_parser) + cfg = MenuItemConfig( + command, f"test {command} help text", clean_execute, clean_parser + ) menu = [cfg] smart_cli = cli.SmartCli(menu) - + captured = capsys.readouterr() # throw away existing output - + build_args = ["smart", command] actual_retval = smart_cli.execute(build_args) - captured = capsys.readouterr() # capture new output - + captured = capsys.readouterr() # capture new output + assert exp_output in captured.out assert actual_retval == exp_retval def test_cli_full_clobber_execute(capsys, monkeypatch): - """Ensure that the execute method of clobber is called """ + """Ensure that the execute method of clobber is called""" exp_retval = 0 exp_output = "mocked-clobber utility" @@ -575,25 +652,23 @@ def mock_operation(*args, **kwargs) -> int: monkeypatch.setattr(smartsim._core._cli.clean, "clean", mock_operation) command = "clobber" - cfg = MenuItemConfig(command, - f"test {command} help text", - clobber_execute) + cfg = MenuItemConfig(command, f"test {command} help text", clobber_execute) menu = [cfg] smart_cli = cli.SmartCli(menu) - + captured = capsys.readouterr() # throw away existing output - + build_args = ["smart", command] actual_retval = smart_cli.execute(build_args) - captured = capsys.readouterr() # capture new output - + captured = capsys.readouterr() # capture new output + assert exp_output in captured.out assert actual_retval == exp_retval def test_cli_full_dbcli_execute(capsys, monkeypatch): - """Ensure that the execute method of dbcli is called """ + """Ensure that the execute method of dbcli is called""" exp_retval = 0 exp_output = "mocked-get_db_path utility" @@ -604,25 
+679,23 @@ def mock_operation(*args, **kwargs) -> int: monkeypatch.setattr(smartsim._core._cli.dbcli, "get_db_path", mock_operation) command = "dbcli" - cfg = MenuItemConfig(command, - f"test {command} help text", - dbcli_execute) + cfg = MenuItemConfig(command, f"test {command} help text", dbcli_execute) menu = [cfg] smart_cli = cli.SmartCli(menu) - + captured = capsys.readouterr() # throw away existing output - + build_args = ["smart", command] actual_retval = smart_cli.execute(build_args) - captured = capsys.readouterr() # capture new output - + captured = capsys.readouterr() # capture new output + assert exp_output in captured.out assert actual_retval == exp_retval def test_cli_full_site_execute(capsys, monkeypatch): - """Ensure that the execute method of site is called """ + """Ensure that the execute method of site is called""" exp_retval = 0 exp_output = "mocked-get_install_path utility" @@ -634,25 +707,23 @@ def mock_operation(*args, **kwargs) -> int: monkeypatch.setattr(smartsim._core._cli.site, "get_install_path", mock_operation) command = "site" - cfg = MenuItemConfig(command, - f"test {command} help text", - site_execute) + cfg = MenuItemConfig(command, f"test {command} help text", site_execute) menu = [cfg] smart_cli = cli.SmartCli(menu) - + captured = capsys.readouterr() # throw away existing output - + build_args = ["smart", command] actual_retval = smart_cli.execute(build_args) - captured = capsys.readouterr() # capture new output - + captured = capsys.readouterr() # capture new output + assert exp_output in captured.out assert actual_retval == exp_retval def test_cli_full_build_execute(capsys, monkeypatch): - """Ensure that the execute method of build is called """ + """Ensure that the execute method of build is called""" exp_retval = 0 exp_output = "mocked-execute-build utility" @@ -664,31 +735,36 @@ def mock_operation(*args, **kwargs) -> int: monkeypatch.setattr(smartsim._core._cli.build, "tabulate", mock_operation) 
monkeypatch.setattr(smartsim._core._cli.build, "build_database", mock_operation) monkeypatch.setattr(smartsim._core._cli.build, "build_redis_ai", mock_operation) - monkeypatch.setattr(smartsim._core._cli.build, "check_py_torch_version", mock_operation) - monkeypatch.setattr(smartsim._core._cli.build, "check_py_tf_version", mock_operation) - monkeypatch.setattr(smartsim._core._cli.build, "check_py_onnx_version", mock_operation) + monkeypatch.setattr( + smartsim._core._cli.build, "check_py_torch_version", mock_operation + ) + monkeypatch.setattr( + smartsim._core._cli.build, "check_py_tf_version", mock_operation + ) + monkeypatch.setattr( + smartsim._core._cli.build, "check_py_onnx_version", mock_operation + ) command = "build" - cfg = MenuItemConfig(command, - f"test {command} help text", - build_execute, - build_parser) + cfg = MenuItemConfig( + command, f"test {command} help text", build_execute, build_parser + ) menu = [cfg] smart_cli = cli.SmartCli(menu) - + captured = capsys.readouterr() # throw away existing output - + build_args = ["smart", command] actual_retval = smart_cli.execute(build_args) - captured = capsys.readouterr() # capture new output - + captured = capsys.readouterr() # capture new output + assert exp_output in captured.out assert actual_retval == exp_retval def _good_build(*args, **kwargs): - print("LGTM") + _TEST_LOGGER.info("LGTM") def _bad_build(*args, **kwargs): @@ -703,17 +779,17 @@ def _mock_temp_dir(*a, **kw): @pytest.mark.parametrize( "mock_verify_fn, expected_stdout, expected_retval", [ - pytest.param(_good_build, 'LGTM', 0, id="Configured Correctly"), + pytest.param(_good_build, "LGTM", os.EX_OK, id="Configured Correctly"), pytest.param( _bad_build, - "SmartSim failed to run a simple experiment", - 2, + "SmartSim failed to run a simple experiment", + os.EX_SOFTWARE, id="Configured Incorrectly", - ) + ), ], ) -def test_cli_build_test_execute( - capsys, +def test_cli_validation_test_execute( + caplog, monkeypatch, mock_verify_fn, 
expected_stdout, @@ -724,6 +800,7 @@ def test_cli_build_test_execute( checks that if at any point the test raises an exception an appropriate error code and error msg are returned. """ + caplog.set_level(logging.INFO) # Mock out the verification tests/avoid file system ops monkeypatch.setattr(smartsim._core._cli.validate, "test_install", mock_verify_fn) @@ -732,27 +809,22 @@ def test_cli_build_test_execute( "_VerificationTempDir", _mock_temp_dir, ) - # Coloredlogs doesn't play nice with capsys + # Coloredlogs doesn't play nice with caplog monkeypatch.setattr( - smartsim._core._cli.validate.logger, - "error", - print, + smartsim._core._cli.validate, + "logger", + _TEST_LOGGER, ) command = "validate" - cfg = MenuItemConfig(command, - f"test {command} help text", - validate_execute, - validate_parser) + cfg = MenuItemConfig( + command, f"test {command} help text", validate_execute, validate_parser + ) menu = [cfg] smart_cli = cli.SmartCli(menu) - captured = capsys.readouterr() # throw away existing output - verify_args = ["smart", command] actual_retval = smart_cli.execute(verify_args) - captured = capsys.readouterr() # capture new output - - assert expected_stdout in captured.out + assert expected_stdout in caplog.text assert actual_retval == expected_retval diff --git a/tests/test_cobalt_parser.py b/tests/test_cobalt_parser.py index 16f8c0b81..e91c95100 100644 --- a/tests/test_cobalt_parser.py +++ b/tests/test_cobalt_parser.py @@ -24,9 +24,13 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import pytest from smartsim._core.launcher.cobalt import cobaltParser +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + def test_parse_step_id(): output = "JobName JobId \n" "=====================\n" "smartsim 507975 \n" diff --git a/tests/test_colo_model_local.py b/tests/test_colo_model_local.py index 376c71f26..e68801762 100644 --- a/tests/test_colo_model_local.py +++ b/tests/test_colo_model_local.py @@ -29,84 +29,103 @@ import pytest from smartsim import Experiment, status -from smartsim.error import SSUnsupportedError from smartsim.entity import Model +from smartsim.error import SSUnsupportedError + +# The tests in this file belong to the slow_tests group +pytestmark = pytest.mark.slow_tests + if sys.platform == "darwin": supported_dbs = ["tcp", "deprecated"] else: supported_dbs = ["uds", "tcp", "deprecated"] -is_mac = sys.platform == 'darwin' +is_mac = sys.platform == "darwin" + -@pytest.mark.skipif(not is_mac, reason='MacOS-only test') -def test_macosx_warning(fileutils, coloutils): - db_args = {"custom_pinning":[1]} - db_type = 'uds' # Test is insensitive to choice of db +@pytest.mark.skipif(not is_mac, reason="MacOS-only test") +def test_macosx_warning(fileutils, test_dir, coloutils): + db_args = {"custom_pinning": [1]} + db_type = "uds" # Test is insensitive to choice of db - exp = Experiment("colocated_model_defaults", launcher="local") + exp = Experiment("colocated_model_defaults", launcher="local", exp_path=test_dir) with pytest.warns( RuntimeWarning, - match="CPU pinning is not supported on MacOSX. Ignoring pinning specification." + match="CPU pinning is not supported on MacOSX. 
Ignoring pinning specification.", ): - colo_model = coloutils.setup_test_colo( + _ = coloutils.setup_test_colo( fileutils, db_type, exp, + "send_data_local_smartredis.py", db_args, ) -def test_unsupported_limit_app(fileutils, coloutils): - db_args = {"limit_app_cpus":True} - db_type = 'uds' # Test is insensitive to choice of db - exp = Experiment("colocated_model_defaults", launcher="local") +def test_unsupported_limit_app(fileutils, test_dir, coloutils): + db_args = {"limit_app_cpus": True} + db_type = "uds" # Test is insensitive to choice of db + + exp = Experiment("colocated_model_defaults", launcher="local", exp_path=test_dir) with pytest.raises(SSUnsupportedError): - colo_model = coloutils.setup_test_colo( + coloutils.setup_test_colo( fileutils, db_type, exp, + "send_data_local_smartredis.py", db_args, ) + @pytest.mark.skipif(is_mac, reason="Unsupported on MacOSX") -@pytest.mark.parametrize("custom_pinning", [1,"10","#",1.,['a'],[1.]]) -def test_unsupported_custom_pinning(fileutils, coloutils, custom_pinning): - db_type = "uds" # Test is insensitive to choice of db +@pytest.mark.parametrize("custom_pinning", [1, "10", "#", 1.0, ["a"], [1.0]]) +def test_unsupported_custom_pinning(fileutils, test_dir, coloutils, custom_pinning): + db_type = "uds" # Test is insensitive to choice of db db_args = {"custom_pinning": custom_pinning} - exp = Experiment("colocated_model_defaults", launcher="local") + exp = Experiment("colocated_model_defaults", launcher="local", exp_path=test_dir) with pytest.raises(TypeError): - colo_model = coloutils.setup_test_colo( + coloutils.setup_test_colo( fileutils, db_type, exp, + "send_data_local_smartredis.py", db_args, ) + @pytest.mark.skipif(is_mac, reason="Unsupported on MacOSX") -@pytest.mark.parametrize("pin_list, num_cpus, expected", [ - pytest.param(None, 2, "0,1", id="Automatic creation of pinned cpu list"), - pytest.param([1,2], 2, "1,2", id="Individual ids only"), - pytest.param([range(2),3], 3, "0,1,3", id="Mixed ranges and 
individual ids"), - pytest.param(range(3), 3, "0,1,2", id="Range only"), - pytest.param([range(8, 10), range(6, 1, -2)], 4, "2,4,6,8,9", id="Multiple ranges"), -]) +@pytest.mark.parametrize( + "pin_list, num_cpus, expected", + [ + pytest.param(None, 2, "0,1", id="Automatic creation of pinned cpu list"), + pytest.param([1, 2], 2, "1,2", id="Individual ids only"), + pytest.param([range(2), 3], 3, "0,1,3", id="Mixed ranges and individual ids"), + pytest.param(range(3), 3, "0,1,2", id="Range only"), + pytest.param( + [range(8, 10), range(6, 1, -2)], 4, "2,4,6,8,9", id="Multiple ranges" + ), + ], +) def test_create_pinning_string(pin_list, num_cpus, expected): assert Model._create_pinning_string(pin_list, num_cpus) == expected @pytest.mark.parametrize("db_type", supported_dbs) -def test_launch_colocated_model_defaults(fileutils, coloutils, db_type, launcher="local"): +def test_launch_colocated_model_defaults( + fileutils, test_dir, coloutils, db_type, launcher="local" +): """Test the launch of a model with a colocated database and local launcher""" - db_args = { } + db_args = {} - exp = Experiment("colocated_model_defaults", launcher=launcher) + exp = Experiment("colocated_model_defaults", launcher=launcher, exp_path=test_dir) colo_model = coloutils.setup_test_colo( fileutils, db_type, exp, + "send_data_local_smartredis.py", db_args, ) @@ -114,20 +133,69 @@ def test_launch_colocated_model_defaults(fileutils, coloutils, db_type, launcher true_pinning = None else: true_pinning = "0" - assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == true_pinning + assert ( + colo_model.run_settings.colocated_db_settings["custom_pinning"] == true_pinning + ) + exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all(stat == status.STATUS_COMPLETED for stat in statuses) # test restarting the colocated model exp.start(colo_model, block=True) 
statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all( + stat == status.STATUS_COMPLETED for stat in statuses + ), f"Statuses {statuses}" + @pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_disable_pinning(fileutils, coloutils, db_type, launcher="local"): +def test_launch_multiple_colocated_models( + fileutils, test_dir, coloutils, wlmutils, db_type, launcher="local" +): + """Test the concurrent launch of two models with a colocated database and local launcher""" + + db_args = {} + + exp = Experiment("multi_colo_models", launcher=launcher, exp_path=test_dir) + colo_models = [ + coloutils.setup_test_colo( + fileutils, + db_type, + exp, + "send_data_local_smartredis.py", + db_args, + colo_model_name="colo0", + port=wlmutils.get_test_port(), + ), + coloutils.setup_test_colo( + fileutils, + db_type, + exp, + "send_data_local_smartredis.py", + db_args, + colo_model_name="colo1", + port=wlmutils.get_test_port() + 1, + ), + ] + exp.generate(*colo_models) + exp.start(*colo_models, block=True) + statuses = exp.get_status(*colo_models) + assert all(stat == status.STATUS_COMPLETED for stat in statuses) + + # test restarting the colocated model + exp.start(*colo_models, block=True) + statuses = exp.get_status(*colo_models) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + - exp = Experiment("colocated_model_pinning_auto_1cpu", launcher=launcher) +@pytest.mark.parametrize("db_type", supported_dbs) +def test_colocated_model_disable_pinning( + fileutils, test_dir, coloutils, db_type, launcher="local" +): + exp = Experiment( + "colocated_model_pinning_auto_1cpu", launcher=launcher, exp_path=test_dir + ) db_args = { "db_cpus": 1, "custom_pinning": [], @@ -137,17 +205,23 @@ def test_colocated_model_disable_pinning(fileutils, coloutils, db_type, launcher fileutils, db_type, exp, + "send_data_local_smartredis.py", db_args, ) assert 
colo_model.run_settings.colocated_db_settings["custom_pinning"] is None + exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all([stat == status.STATUS_COMPLETED for stat in statuses]) -@pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_pinning_auto_2cpu(fileutils, coloutils, db_type, launcher="local"): - exp = Experiment("colocated_model_pinning_auto_2cpu", launcher=launcher) +@pytest.mark.parametrize("db_type", supported_dbs) +def test_colocated_model_pinning_auto_2cpu( + fileutils, test_dir, coloutils, db_type, launcher="local" +): + exp = Experiment( + "colocated_model_pinning_auto_2cpu", launcher=launcher, exp_path=test_dir + ) db_args = { "db_cpus": 2, @@ -158,59 +232,82 @@ def test_colocated_model_pinning_auto_2cpu(fileutils, coloutils, db_type, launch fileutils, db_type, exp, + "send_data_local_smartredis.py", db_args, ) if is_mac: true_pinning = None else: true_pinning = "0,1" - assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == true_pinning + assert ( + colo_model.run_settings.colocated_db_settings["custom_pinning"] == true_pinning + ) + exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + @pytest.mark.skipif(is_mac, reason="unsupported on MacOSX") @pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_pinning_range(fileutils, coloutils, db_type, launcher="local"): +def test_colocated_model_pinning_range( + fileutils, test_dir, coloutils, db_type, launcher="local" +): # Check to make sure that the CPU mask was correctly generated - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + exp = Experiment( + "colocated_model_pinning_manual", launcher=launcher, exp_path=test_dir + ) - db_args = { - "db_cpus": 2, - "custom_pinning": range(2) - } + db_args = {"db_cpus": 2, "custom_pinning": range(2)} colo_model 
= coloutils.setup_test_colo( fileutils, db_type, exp, + "send_data_local_smartredis.py", db_args, ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0,1" + exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + @pytest.mark.skipif(is_mac, reason="unsupported on MacOSX") @pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_pinning_list(fileutils, coloutils, db_type, launcher="local"): +def test_colocated_model_pinning_list( + fileutils, test_dir, coloutils, db_type, launcher="local" +): # Check to make sure that the CPU mask was correctly generated - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + exp = Experiment( + "colocated_model_pinning_manual", launcher=launcher, exp_path=test_dir + ) - db_args = { - "db_cpus": 1, - "custom_pinning": [1] - } + db_args = {"db_cpus": 1, "custom_pinning": [1]} colo_model = coloutils.setup_test_colo( fileutils, db_type, exp, + "send_data_local_smartredis.py", db_args, ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "1" + exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) \ No newline at end of file + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + + +def test_colo_uds_verifies_socket_file_name(test_dir, launcher="local"): + exp = Experiment(f"colo_uds_wrong_name", launcher=launcher, exp_path=test_dir) + + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=["--version"]) + + colo_model = exp.create_model("wrong_uds_socket_name", colo_settings) + + with pytest.raises(ValueError): + colo_model.colocate_db_uds(unix_socket="this is not a valid name!") diff --git a/tests/test_colo_model_lsf.py b/tests/test_colo_model_lsf.py index 7c4fd0e01..e77eeedec 100644 --- 
a/tests/test_colo_model_lsf.py +++ b/tests/test_colo_model_lsf.py @@ -28,17 +28,21 @@ import pytest +import smartsim.settings.base from smartsim import Experiment from smartsim.entity import Model -import smartsim.settings.base from smartsim.settings.lsfSettings import JsrunSettings +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a -is_mac = sys.platform == 'darwin' + +is_mac = sys.platform == "darwin" class ExpectationMet(Exception): """Use this to break a test when we verify a call path is achieved""" + ... @@ -52,8 +56,8 @@ def test_jsrun_prep(fileutils, coloutils, monkeypatch): # mock the prep method to raise an exception that short circuits test when goal is met monkeypatch.setattr(JsrunSettings, "_prep_colocated_db", show_expectation_met) - db_args = {"custom_pinning":[1]} - db_type = 'uds' # Test is insensitive to choice of db + db_args = {"custom_pinning": [1]} + db_type = "uds" # Test is insensitive to choice of db exp = Experiment("colocated_model_lsf", launcher="lsf") @@ -64,19 +68,20 @@ def test_jsrun_prep(fileutils, coloutils, monkeypatch): fileutils, db_type, exp, + "send_data_local_smartredis.py", db_args, - colo_settings = run_settings, + colo_settings=run_settings, ) def test_non_js_run_prep(fileutils, coloutils, monkeypatch): - """Ensure that RunSettings does not attempt to call a prep method""" + """Ensure that RunSettings does not attempt to call a prep method""" monkeypatch.setattr(smartsim.settings.base, "expand_exe_path", lambda x: "/bin/{x}") # mock prep method to ensure that the exception isn't thrown w/non-JsrunSettings arg monkeypatch.setattr(JsrunSettings, "_prep_colocated_db", show_expectation_met) - db_args = {"custom_pinning":[1]} - db_type = 'tcp' # Test is insensitive to choice of db + db_args = {"custom_pinning": [1]} + db_type = "tcp" # Test is insensitive to choice of db exp = Experiment("colocated_model_lsf", launcher="lsf") @@ -86,31 +91,42 @@ def test_non_js_run_prep(fileutils, coloutils, 
monkeypatch): fileutils, db_type, exp, + "send_data_local_smartredis.py", db_args, - colo_settings = run_settings, + colo_settings=run_settings, ) assert colo_model @pytest.mark.parametrize( - "exp_run_arg_key,run_arg_key,exp_value,test_value", - [ - pytest.param("cpu_per_rs", "cpu_per_rs", 11, 11, id="cpu_per_rs matches input"), - pytest.param("c", "c", 22, 22, id="c matches input"), - pytest.param("cpu_per_rs", "cpu_per_rsx", 1, 33, id="key typo: cpu_per_rsx gives default"), - pytest.param("cpu_per_rs", "cx", 1, 44, id="key typo: cx gives default"), - ] + "exp_run_arg_key,run_arg_key,exp_value,test_value", + [ + pytest.param("cpu_per_rs", "cpu_per_rs", 11, 11, id="cpu_per_rs matches input"), + pytest.param("c", "c", 22, 22, id="c matches input"), + pytest.param( + "cpu_per_rs", "cpu_per_rsx", 1, 33, id="key typo: cpu_per_rsx gives default" + ), + pytest.param("cpu_per_rs", "cx", 1, 44, id="key typo: cx gives default"), + ], ) -def test_jsrun_prep_cpu_per_flag_set_check(fileutils, coloutils, monkeypatch, exp_run_arg_key, run_arg_key, exp_value, test_value): +def test_jsrun_prep_cpu_per_flag_set_check( + fileutils, + coloutils, + monkeypatch, + exp_run_arg_key, + run_arg_key, + exp_value, + test_value, +): """Ensure that _prep_colocated_db honors basic cpu_per_rs config and allows a - valid input parameter to result in the correct output. If no expected input (or + valid input parameter to result in the correct output. 
If no expected input (or incorrect key) is given, the default should be returned using default config key""" monkeypatch.setattr(smartsim.settings.base, "expand_exe_path", lambda x: "/bin/{x}") # excluding "db_cpus" should result in default value in comparison & output - db_args = {"custom_pinning":[1]} - db_type = 'uds' # Test is insensitive to choice of db + db_args = {"custom_pinning": [1]} + db_type = "uds" # Test is insensitive to choice of db exp = Experiment("colocated_model_lsf", launcher="lsf") @@ -121,29 +137,42 @@ def test_jsrun_prep_cpu_per_flag_set_check(fileutils, coloutils, monkeypatch, ex fileutils, db_type, exp, + "send_data_local_smartredis.py", db_args, - colo_settings = run_settings, + colo_settings=run_settings, ) assert colo_model.run_settings.run_args[exp_run_arg_key] == exp_value @pytest.mark.parametrize( - "exp_run_arg_key,run_arg_key,exp_value,test_value", - [ - pytest.param("cpu_per_rs", "cpu_per_rs", 11, 11, id="cpu_per_rs matches input"), - pytest.param("c", "c", 22, 22, id="c matches input"), - pytest.param("cpu_per_rs", "cpu_per_rsx", 3, 33, id="key typo: db_cpus out (not default)"), - pytest.param("cpu_per_rs", "cx", 3, 44, id="key typo: get db_cpus out (not default)"), - ] + "exp_run_arg_key,run_arg_key,exp_value,test_value", + [ + pytest.param("cpu_per_rs", "cpu_per_rs", 11, 11, id="cpu_per_rs matches input"), + pytest.param("c", "c", 22, 22, id="c matches input"), + pytest.param( + "cpu_per_rs", "cpu_per_rsx", 3, 33, id="key typo: db_cpus out (not default)" + ), + pytest.param( + "cpu_per_rs", "cx", 3, 44, id="key typo: get db_cpus out (not default)" + ), + ], ) -def test_jsrun_prep_db_cpu_override(fileutils, coloutils, monkeypatch, exp_run_arg_key, run_arg_key, exp_value, test_value): +def test_jsrun_prep_db_cpu_override( + fileutils, + coloutils, + monkeypatch, + exp_run_arg_key, + run_arg_key, + exp_value, + test_value, +): """Ensure that both cpu_per_rs and c input config override db_cpus""" 
monkeypatch.setattr(smartsim.settings.base, "expand_exe_path", lambda x: "/bin/{x}") # setting "db_cpus" should result in non-default value in comparison & output - db_args = {"custom_pinning":[1], "db_cpus": 3} - db_type = 'tcp' # Test is insensitive to choice of db + db_args = {"custom_pinning": [1], "db_cpus": 3} + db_type = "tcp" # Test is insensitive to choice of db exp = Experiment("colocated_model_lsf", launcher="lsf") @@ -154,29 +183,40 @@ def test_jsrun_prep_db_cpu_override(fileutils, coloutils, monkeypatch, exp_run_a fileutils, db_type, exp, + "send_data_local_smartredis.py", db_args, - colo_settings = run_settings, + colo_settings=run_settings, ) assert colo_model.run_settings.run_args[exp_run_arg_key] == exp_value @pytest.mark.parametrize( - "exp_run_arg_key,run_arg_key,exp_value,test_value", - [ - pytest.param("cpu_per_rs", "cpu_per_rs", 8, 3, id="cpu_per_rs swaps to db_cpus"), - pytest.param("c", "c", 8, 4, id="c swaps to db_cpus"), - pytest.param("cpu_per_rs", "cpu_per_rsx", 8, 5, id="key typo: db_cpus out"), - pytest.param("cpu_per_rs", "cx", 8, 6, id="key typo: get db_cpus out"), - ] + "exp_run_arg_key,run_arg_key,exp_value,test_value", + [ + pytest.param( + "cpu_per_rs", "cpu_per_rs", 8, 3, id="cpu_per_rs swaps to db_cpus" + ), + pytest.param("c", "c", 8, 4, id="c swaps to db_cpus"), + pytest.param("cpu_per_rs", "cpu_per_rsx", 8, 5, id="key typo: db_cpus out"), + pytest.param("cpu_per_rs", "cx", 8, 6, id="key typo: get db_cpus out"), + ], ) -def test_jsrun_prep_db_cpu_replacement(fileutils, coloutils, monkeypatch, exp_run_arg_key, run_arg_key, exp_value, test_value): +def test_jsrun_prep_db_cpu_replacement( + fileutils, + coloutils, + monkeypatch, + exp_run_arg_key, + run_arg_key, + exp_value, + test_value, +): """Ensure that db_cpus default is used if user config suggests underutilizing resources""" monkeypatch.setattr(smartsim.settings.base, "expand_exe_path", lambda x: "/bin/{x}") # setting "db_cpus" should result in non-default value in 
comparison & output - db_args = {"custom_pinning":[1], "db_cpus": 8} - db_type = 'uds' # Test is insensitive to choice of db + db_args = {"custom_pinning": [1], "db_cpus": 8} + db_type = "uds" # Test is insensitive to choice of db exp = Experiment("colocated_model_lsf", launcher="lsf") @@ -187,31 +227,46 @@ def test_jsrun_prep_db_cpu_replacement(fileutils, coloutils, monkeypatch, exp_ru fileutils, db_type, exp, + "send_data_local_smartredis.py", db_args, - colo_settings = run_settings, + colo_settings=run_settings, ) assert colo_model.run_settings.run_args[exp_run_arg_key] == exp_value @pytest.mark.parametrize( - "exp_run_arg_key,run_arg_key,exp_value,test_value", - [ - pytest.param("rs_per_host", "rs_per_host", 1, 1, id="rs_per_host is 1"), - pytest.param("r", "r", 1, 1, id="r is 1"), - pytest.param("rs_per_host", "rs_per_host", 1, 2, id="rs_per_host replaced w/1"), - pytest.param("r", "r", 1, 3, id="r replaced w/1"), - pytest.param("rs_per_host", "rs_per_hostx", 1, 4, id="key typo: rs_per_hostx gets default"), - pytest.param("rs_per_host", "rx", 1, 5, id="key typo: rx gets default"), - ] + "exp_run_arg_key,run_arg_key,exp_value,test_value", + [ + pytest.param("rs_per_host", "rs_per_host", 1, 1, id="rs_per_host is 1"), + pytest.param("r", "r", 1, 1, id="r is 1"), + pytest.param("rs_per_host", "rs_per_host", 1, 2, id="rs_per_host replaced w/1"), + pytest.param("r", "r", 1, 3, id="r replaced w/1"), + pytest.param( + "rs_per_host", + "rs_per_hostx", + 1, + 4, + id="key typo: rs_per_hostx gets default", + ), + pytest.param("rs_per_host", "rx", 1, 5, id="key typo: rx gets default"), + ], ) -def test_jsrun_prep_rs_per_host(fileutils, coloutils, monkeypatch, exp_run_arg_key, run_arg_key, exp_value, test_value): +def test_jsrun_prep_rs_per_host( + fileutils, + coloutils, + monkeypatch, + exp_run_arg_key, + run_arg_key, + exp_value, + test_value, +): """Ensure that resource-per-host settings are configured and are modified as required to meet limitations (e.g. 
rs_per_host MUST equal 1)""" monkeypatch.setattr(smartsim.settings.base, "expand_exe_path", lambda x: "/bin/{x}") - db_args = {"custom_pinning":[1]} - db_type = 'tcp' # Test is insensitive to choice of db + db_args = {"custom_pinning": [1]} + db_type = "tcp" # Test is insensitive to choice of db exp = Experiment("colocated_model_lsf", launcher="lsf") @@ -222,8 +277,9 @@ def test_jsrun_prep_rs_per_host(fileutils, coloutils, monkeypatch, exp_run_arg_k fileutils, db_type, exp, + "send_data_local_smartredis.py", db_args, - colo_settings = run_settings, + colo_settings=run_settings, ) # NOTE: _prep_colocated_db sets this to a string & not an integer diff --git a/tests/test_config.py b/tests/test_config.py index 5e28202b1..bbbb54526 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -25,17 +25,18 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import os +import typing as t from pathlib import Path -import os -from re import A, L import pytest -import shutil -import typing as t from smartsim._core.config.config import Config from smartsim.error import SSConfigError +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + def test_all_config_defaults(): config = Config() @@ -54,8 +55,10 @@ def test_all_config_defaults(): config.test_device -def get_redisai_env(rai_path: t.Optional[str], lib_path: t.Optional[str]) -> t.Dict[str, str]: - """Convenience method to create a set of environment variables +def get_redisai_env( + rai_path: t.Optional[str], lib_path: t.Optional[str] +) -> t.Dict[str, str]: + """Convenience method to create a set of environment variables that include RedisAI-specific variables :param rai_path: The path to the RedisAI library :type: str (optional) @@ -68,21 +71,26 @@ def get_redisai_env(rai_path: t.Optional[str], lib_path: t.Optional[str]) -> t.D env["RAI_PATH"] = rai_path else: env.pop("RAI_PATH", None) - + if lib_path is not None: env["SMARTSIM_DEP_INSTALL_PATH"] = lib_path 
else: env.pop("SMARTSIM_DEP_INSTALL_PATH", None) - + return env -def test_redisai_invalid_rai_path(fileutils, monkeypatch): +def make_file(filepath: str) -> None: + os.makedirs(os.path.dirname(filepath)) + with open(filepath, "w+", encoding="utf-8") as dummy_file: + dummy_file.write("dummy\n") + + +def test_redisai_invalid_rai_path(test_dir, monkeypatch): """An invalid RAI_PATH and valid SMARTSIM_DEP_INSTALL_PATH should fail""" - test_dir = fileutils.make_test_dir() - rai_file_path = os.path.join(test_dir, "lib", "mock-redisai.so") - _ = fileutils.make_test_file("redisai.so", "lib") + rai_file_path = os.path.join(test_dir, "lib", "mock-redisai.so") + make_file(os.path.join(test_dir, "lib", "redisai.so")) env = get_redisai_env(rai_file_path, test_dir) monkeypatch.setattr(os, "environ", env) @@ -92,14 +100,14 @@ def test_redisai_invalid_rai_path(fileutils, monkeypatch): with pytest.raises(SSConfigError) as ex: _ = config.redisai - assert 'RedisAI dependency not found' in ex.value.args[0] + assert "RedisAI dependency not found" in ex.value.args[0] -def test_redisai_valid_rai_path(fileutils, monkeypatch): +def test_redisai_valid_rai_path(test_dir, monkeypatch): """A valid RAI_PATH should override valid SMARTSIM_DEP_INSTALL_PATH and succeed""" - test_dir = fileutils.make_test_dir() - rai_file_path = fileutils.make_test_file("mock-redisai.so", "lib") - _ = fileutils.make_test_file("redisai.so", "deps") + + rai_file_path = os.path.join(test_dir, "lib", "mock-redisai.so") + make_file(rai_file_path) env = get_redisai_env(rai_file_path, test_dir) monkeypatch.setattr(os, "environ", env) @@ -110,45 +118,45 @@ def test_redisai_valid_rai_path(fileutils, monkeypatch): assert config.redisai == rai_file_path -def test_redisai_invalid_lib_path(fileutils, monkeypatch): +def test_redisai_invalid_lib_path(test_dir, monkeypatch): """Invalid RAI_PATH and invalid SMARTSIM_DEP_INSTALL_PATH should fail""" - test_dir = fileutils.make_test_dir() + rai_file_path = 
f"{test_dir}/railib/redisai.so" env = get_redisai_env(rai_file_path, test_dir) monkeypatch.setattr(os, "environ", env) - + config = Config() # Fail when no files exist @ either location with pytest.raises(SSConfigError) as ex: _ = config.redisai - assert 'RedisAI dependency not found' in ex.value.args[0] + assert "RedisAI dependency not found" in ex.value.args[0] -def test_redisai_valid_lib_path(fileutils, monkeypatch): +def test_redisai_valid_lib_path(test_dir, monkeypatch): """Valid RAI_PATH and invalid SMARTSIM_DEP_INSTALL_PATH should succeed""" - test_dir = fileutils.make_test_dir() - rai_file_path = fileutils.make_test_file("mock-redisai.so", "lib") + rai_file_path = os.path.join(test_dir, "lib", "mock-redisai.so") + make_file(rai_file_path) env = get_redisai_env(rai_file_path, test_dir) monkeypatch.setattr(os, "environ", env) - + config = Config() assert config.redisai assert Path(config.redisai).is_file() assert config.redisai == rai_file_path -def test_redisai_valid_lib_path_null_rai(fileutils, monkeypatch): +def test_redisai_valid_lib_path_null_rai(test_dir, monkeypatch): """Missing RAI_PATH and valid SMARTSIM_DEP_INSTALL_PATH should succeed""" - test_dir = fileutils.make_test_dir() - rai_file_path: t.Optional[str] = None - lib_file_path = fileutils.make_test_file("redisai.so", "lib") + rai_file_path: t.Optional[str] = None + lib_file_path = os.path.join(test_dir, "lib", "redisai.so") + make_file(lib_file_path) env = get_redisai_env(rai_file_path, test_dir) monkeypatch.setattr(os, "environ", env) - + config = Config() assert config.redisai assert Path(config.redisai).is_file() @@ -189,3 +197,61 @@ def test_redis_cli(): with pytest.raises(SSConfigError): config.database_cli os.environ.pop("REDIS_CLI_PATH") + + +@pytest.mark.parametrize( + "value, exp_result", + [ + pytest.param("0", False, id="letter zero"), + pytest.param("1", True, id="letter one"), + pytest.param("-1", False, id="letter negative one"), + pytest.param(None, False, id="not in env"), + ], 
+) +def test_telemetry_flag( + monkeypatch: pytest.MonkeyPatch, value: t.Optional[str], exp_result: bool +): + if value is not None: + monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", value) + else: + monkeypatch.delenv("SMARTSIM_FLAG_TELEMETRY", raising=False) + config = Config() + assert config.telemetry_enabled == exp_result + + +@pytest.mark.parametrize( + "value, exp_result", + [ + pytest.param("1", 1, id="1"), + pytest.param("123", 123, id="123"), + pytest.param(None, 5, id="not in env"), + ], +) +def test_telemetry_frequency( + monkeypatch: pytest.MonkeyPatch, value: t.Optional[str], exp_result: int +): + if value is not None: + monkeypatch.setenv("SMARTSIM_TELEMETRY_FREQUENCY", value) + else: + monkeypatch.delenv("SMARTSIM_TELEMETRY_FREQUENCY", raising=False) + config = Config() + assert config.telemetry_frequency == exp_result + + +@pytest.mark.parametrize( + "value, exp_result", + [ + pytest.param("30", 30, id="30"), + pytest.param("123", 123, id="123"), + pytest.param(None, 90, id="not in env"), + ], +) +def test_telemetry_cooldown( + monkeypatch: pytest.MonkeyPatch, value: t.Optional[str], exp_result: bool +): + if value is not None: + monkeypatch.setenv("SMARTSIM_TELEMETRY_COOLDOWN", value) + else: + monkeypatch.delenv("SMARTSIM_TELEMETRY_COOLDOWN", raising=False) + config = Config() + assert config.telemetry_cooldown == exp_result diff --git a/tests/test_configs/bad.py b/tests/test_configs/bad.py index 8eff7e2cb..93e4864ff 100644 --- a/tests/test_configs/bad.py +++ b/tests/test_configs/bad.py @@ -35,7 +35,6 @@ def divide_by_zero(time_to_wait): if __name__ == "__main__": - parser = argparse.ArgumentParser() parser.add_argument("--time", type=int, default=0) args = parser.parse_args() diff --git a/tests/test_configs/check_env.py b/tests/test_configs/check_env.py index 72a2107ea..fd706ee83 100644 --- a/tests/test_configs/check_env.py +++ b/tests/test_configs/check_env.py @@ -8,4 +8,4 @@ print(f"{var_name}=={env_value}") sys.exit(0) -print('env var not 
found') +print("env var not found") diff --git a/tests/test_configs/cov/cobalt_cov.cfg b/tests/test_configs/cov/cobalt_cov.cfg index 84e727b68..1d02e9f52 100644 --- a/tests/test_configs/cov/cobalt_cov.cfg +++ b/tests/test_configs/cov/cobalt_cov.cfg @@ -24,6 +24,9 @@ exclude_lines= raise AssertionError raise NotImplementedError + # Don't complain about type-checking only blocks + if (t(yping)?\.)?TYPE_CHECKING: + # Don't complain if non-runnable code isn't run: if 0: if __name__ == .__main__.: diff --git a/tests/test_configs/cov/local_cov.cfg b/tests/test_configs/cov/local_cov.cfg index cb922f98e..d25e9f83a 100644 --- a/tests/test_configs/cov/local_cov.cfg +++ b/tests/test_configs/cov/local_cov.cfg @@ -6,6 +6,7 @@ omit = *mpirun* *alps* *lsf* + *pals* *redis_starter.py* */_cli/* */_install/* @@ -25,6 +26,9 @@ exclude_lines= raise AssertionError raise NotImplementedError + # Don't complain about type-checking only blocks + if (t(yping)?\.)?TYPE_CHECKING: + # Don't complain if non-runnable code isn't run: if 0: if __name__ == .__main__.: @@ -47,3 +51,4 @@ exclude_lines= launcher == "pbs" launcher == "cobalt" launcher == "lsf" + launcher == "pals" diff --git a/tests/test_configs/cov/lsf_cov.cfg b/tests/test_configs/cov/lsf_cov.cfg index fb0cb033e..6e5f52eb4 100644 --- a/tests/test_configs/cov/lsf_cov.cfg +++ b/tests/test_configs/cov/lsf_cov.cfg @@ -24,6 +24,9 @@ exclude_lines= raise AssertionError raise NotImplementedError + # Don't complain about type-checking only blocks + if (t(yping)?\.)?TYPE_CHECKING: + # Don't complain if non-runnable code isn't run: if 0: if __name__ == .__main__.: diff --git a/tests/test_configs/cov/pbs_cov.cfg b/tests/test_configs/cov/pbs_cov.cfg index 275c46454..99e7bcfd6 100644 --- a/tests/test_configs/cov/pbs_cov.cfg +++ b/tests/test_configs/cov/pbs_cov.cfg @@ -25,6 +25,9 @@ exclude_lines= raise AssertionError raise NotImplementedError + # Don't complain about type-checking only blocks + if (t(yping)?\.)?TYPE_CHECKING: + # Don't complain if 
non-runnable code isn't run: if 0: if __name__ == .__main__.: diff --git a/tests/test_configs/cov/slurm_cov.cfg b/tests/test_configs/cov/slurm_cov.cfg index edbafcda9..59405bc35 100644 --- a/tests/test_configs/cov/slurm_cov.cfg +++ b/tests/test_configs/cov/slurm_cov.cfg @@ -25,6 +25,9 @@ exclude_lines= raise AssertionError raise NotImplementedError + # Don't complain about type-checking only blocks + if (t(yping)?\.)?TYPE_CHECKING: + # Don't complain if non-runnable code isn't run: if 0: if __name__ == .__main__.: diff --git a/tests/test_configs/echo.py b/tests/test_configs/echo.py new file mode 100644 index 000000000..6523f4e4f --- /dev/null +++ b/tests/test_configs/echo.py @@ -0,0 +1,42 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import time + + +def echo(message: str, sleep_time: int): + if sleep_time > 0: + time.sleep(sleep_time) + print(f"Echoing: {message}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--message", type=str, default="Lorem ipsum") + parser.add_argument("--sleep_time", type=int, default=0) + args = parser.parse_args() + echo(args.message, args.sleep_time) diff --git a/tests/test_configs/h2o.inp b/tests/test_configs/generator_files/circular_config/h2o.inp similarity index 100% rename from tests/test_configs/h2o.inp rename to tests/test_configs/generator_files/circular_config/h2o.inp diff --git a/tests/test_configs/circular_config/sub_dir/circle b/tests/test_configs/generator_files/circular_config/sub_dir/circle similarity index 100% rename from tests/test_configs/circular_config/sub_dir/circle rename to tests/test_configs/generator_files/circular_config/sub_dir/circle diff --git a/tests/test_configs/circular_config/sub_dir/hello.sh b/tests/test_configs/generator_files/circular_config/sub_dir/hello.sh similarity index 100% rename from tests/test_configs/circular_config/sub_dir/hello.sh rename to tests/test_configs/generator_files/circular_config/sub_dir/hello.sh diff --git a/tests/test_configs/easy/correct/MOM_input b/tests/test_configs/generator_files/easy/correct/MOM_input similarity index 100% rename from tests/test_configs/easy/correct/MOM_input rename to 
tests/test_configs/generator_files/easy/correct/MOM_input diff --git a/tests/test_configs/easy/correct/example_input.i b/tests/test_configs/generator_files/easy/correct/example_input.i similarity index 100% rename from tests/test_configs/easy/correct/example_input.i rename to tests/test_configs/generator_files/easy/correct/example_input.i diff --git a/tests/test_configs/easy/correct/in.airebo b/tests/test_configs/generator_files/easy/correct/in.airebo similarity index 100% rename from tests/test_configs/easy/correct/in.airebo rename to tests/test_configs/generator_files/easy/correct/in.airebo diff --git a/tests/test_configs/easy/correct/in.atm b/tests/test_configs/generator_files/easy/correct/in.atm similarity index 100% rename from tests/test_configs/easy/correct/in.atm rename to tests/test_configs/generator_files/easy/correct/in.atm diff --git a/tests/test_configs/easy/correct/in.crack b/tests/test_configs/generator_files/easy/correct/in.crack similarity index 100% rename from tests/test_configs/easy/correct/in.crack rename to tests/test_configs/generator_files/easy/correct/in.crack diff --git a/tests/test_configs/easy/correct/in.ellipse.gayberne b/tests/test_configs/generator_files/easy/correct/in.ellipse.gayberne similarity index 100% rename from tests/test_configs/easy/correct/in.ellipse.gayberne rename to tests/test_configs/generator_files/easy/correct/in.ellipse.gayberne diff --git a/tests/test_configs/easy/correct/input-file.inp b/tests/test_configs/generator_files/easy/correct/input-file.inp similarity index 100% rename from tests/test_configs/easy/correct/input-file.inp rename to tests/test_configs/generator_files/easy/correct/input-file.inp diff --git a/tests/test_configs/easy/correct/input.nml b/tests/test_configs/generator_files/easy/correct/input.nml similarity index 100% rename from tests/test_configs/easy/correct/input.nml rename to tests/test_configs/generator_files/easy/correct/input.nml diff --git a/tests/test_configs/easy/correct/simple-H20.xml 
b/tests/test_configs/generator_files/easy/correct/simple-H20.xml similarity index 100% rename from tests/test_configs/easy/correct/simple-H20.xml rename to tests/test_configs/generator_files/easy/correct/simple-H20.xml diff --git a/tests/test_configs/easy/marked/MOM_input b/tests/test_configs/generator_files/easy/marked/MOM_input similarity index 100% rename from tests/test_configs/easy/marked/MOM_input rename to tests/test_configs/generator_files/easy/marked/MOM_input diff --git a/tests/test_configs/easy/marked/example_input.i b/tests/test_configs/generator_files/easy/marked/example_input.i similarity index 100% rename from tests/test_configs/easy/marked/example_input.i rename to tests/test_configs/generator_files/easy/marked/example_input.i diff --git a/tests/test_configs/easy/marked/in.airebo b/tests/test_configs/generator_files/easy/marked/in.airebo similarity index 100% rename from tests/test_configs/easy/marked/in.airebo rename to tests/test_configs/generator_files/easy/marked/in.airebo diff --git a/tests/test_configs/easy/marked/in.atm b/tests/test_configs/generator_files/easy/marked/in.atm similarity index 100% rename from tests/test_configs/easy/marked/in.atm rename to tests/test_configs/generator_files/easy/marked/in.atm diff --git a/tests/test_configs/easy/marked/in.crack b/tests/test_configs/generator_files/easy/marked/in.crack similarity index 100% rename from tests/test_configs/easy/marked/in.crack rename to tests/test_configs/generator_files/easy/marked/in.crack diff --git a/tests/test_configs/easy/marked/in.ellipse.gayberne b/tests/test_configs/generator_files/easy/marked/in.ellipse.gayberne similarity index 100% rename from tests/test_configs/easy/marked/in.ellipse.gayberne rename to tests/test_configs/generator_files/easy/marked/in.ellipse.gayberne diff --git a/tests/test_configs/easy/marked/input-file.inp b/tests/test_configs/generator_files/easy/marked/input-file.inp similarity index 100% rename from tests/test_configs/easy/marked/input-file.inp 
rename to tests/test_configs/generator_files/easy/marked/input-file.inp diff --git a/tests/test_configs/easy/marked/input.nml b/tests/test_configs/generator_files/easy/marked/input.nml similarity index 100% rename from tests/test_configs/easy/marked/input.nml rename to tests/test_configs/generator_files/easy/marked/input.nml diff --git a/tests/test_configs/easy/marked/simple-H20.xml b/tests/test_configs/generator_files/easy/marked/simple-H20.xml similarity index 100% rename from tests/test_configs/easy/marked/simple-H20.xml rename to tests/test_configs/generator_files/easy/marked/simple-H20.xml diff --git a/tests/test_configs/in.atm b/tests/test_configs/generator_files/in.atm similarity index 100% rename from tests/test_configs/in.atm rename to tests/test_configs/generator_files/in.atm diff --git a/tests/test_configs/generator_files/log_params/dir_test/dir_test_0/smartsim_params.txt b/tests/test_configs/generator_files/log_params/dir_test/dir_test_0/smartsim_params.txt new file mode 100644 index 000000000..373cec87e --- /dev/null +++ b/tests/test_configs/generator_files/log_params/dir_test/dir_test_0/smartsim_params.txt @@ -0,0 +1,8 @@ +Model name: dir_test_0 +File name Parameters +-------------------------- --------------- +dir_test/dir_test_0/in.atm Name Value + ------ ------- + THERMO 10 + STEPS 10 + diff --git a/tests/test_configs/generator_files/log_params/dir_test/dir_test_1/smartsim_params.txt b/tests/test_configs/generator_files/log_params/dir_test/dir_test_1/smartsim_params.txt new file mode 100644 index 000000000..e45ebb6bf --- /dev/null +++ b/tests/test_configs/generator_files/log_params/dir_test/dir_test_1/smartsim_params.txt @@ -0,0 +1,8 @@ +Model name: dir_test_1 +File name Parameters +-------------------------- --------------- +dir_test/dir_test_1/in.atm Name Value + ------ ------- + THERMO 10 + STEPS 20 + diff --git a/tests/test_configs/generator_files/log_params/dir_test/dir_test_2/smartsim_params.txt 
b/tests/test_configs/generator_files/log_params/dir_test/dir_test_2/smartsim_params.txt new file mode 100644 index 000000000..081dc56c6 --- /dev/null +++ b/tests/test_configs/generator_files/log_params/dir_test/dir_test_2/smartsim_params.txt @@ -0,0 +1,8 @@ +Model name: dir_test_2 +File name Parameters +-------------------------- --------------- +dir_test/dir_test_2/in.atm Name Value + ------ ------- + THERMO 20 + STEPS 10 + diff --git a/tests/test_configs/generator_files/log_params/dir_test/dir_test_3/smartsim_params.txt b/tests/test_configs/generator_files/log_params/dir_test/dir_test_3/smartsim_params.txt new file mode 100644 index 000000000..3403f7c71 --- /dev/null +++ b/tests/test_configs/generator_files/log_params/dir_test/dir_test_3/smartsim_params.txt @@ -0,0 +1,8 @@ +Model name: dir_test_3 +File name Parameters +-------------------------- --------------- +dir_test/dir_test_3/in.atm Name Value + ------ ------- + THERMO 20 + STEPS 20 + diff --git a/tests/test_configs/generator_files/log_params/smartsim_params.txt b/tests/test_configs/generator_files/log_params/smartsim_params.txt new file mode 100644 index 000000000..6ac92049f --- /dev/null +++ b/tests/test_configs/generator_files/log_params/smartsim_params.txt @@ -0,0 +1,32 @@ +Generation start date and time: 08/09/2023 18:22:44 +Model name: dir_test_0 +File name Parameters +-------------------------- --------------- +dir_test/dir_test_0/in.atm Name Value + ------ ------- + THERMO 10 + STEPS 10 + +Model name: dir_test_1 +File name Parameters +-------------------------- --------------- +dir_test/dir_test_1/in.atm Name Value + ------ ------- + THERMO 10 + STEPS 20 + +Model name: dir_test_2 +File name Parameters +-------------------------- --------------- +dir_test/dir_test_2/in.atm Name Value + ------ ------- + THERMO 20 + STEPS 10 + +Model name: dir_test_3 +File name Parameters +-------------------------- --------------- +dir_test/dir_test_3/in.atm Name Value + ------ ------- + THERMO 20 + STEPS 20 diff 
--git a/tests/test_configs/med/correct/MOM_input b/tests/test_configs/generator_files/med/correct/MOM_input similarity index 100% rename from tests/test_configs/med/correct/MOM_input rename to tests/test_configs/generator_files/med/correct/MOM_input diff --git a/tests/test_configs/med/correct/diag_table b/tests/test_configs/generator_files/med/correct/diag_table similarity index 100% rename from tests/test_configs/med/correct/diag_table rename to tests/test_configs/generator_files/med/correct/diag_table diff --git a/tests/test_configs/med/correct/example_input.i b/tests/test_configs/generator_files/med/correct/example_input.i similarity index 100% rename from tests/test_configs/med/correct/example_input.i rename to tests/test_configs/generator_files/med/correct/example_input.i diff --git a/tests/test_configs/med/correct/in.airebo b/tests/test_configs/generator_files/med/correct/in.airebo similarity index 100% rename from tests/test_configs/med/correct/in.airebo rename to tests/test_configs/generator_files/med/correct/in.airebo diff --git a/tests/test_configs/med/correct/in.atm b/tests/test_configs/generator_files/med/correct/in.atm similarity index 100% rename from tests/test_configs/med/correct/in.atm rename to tests/test_configs/generator_files/med/correct/in.atm diff --git a/tests/test_configs/med/correct/in.crack b/tests/test_configs/generator_files/med/correct/in.crack similarity index 100% rename from tests/test_configs/med/correct/in.crack rename to tests/test_configs/generator_files/med/correct/in.crack diff --git a/tests/test_configs/med/correct/in.ellipse.gayberne b/tests/test_configs/generator_files/med/correct/in.ellipse.gayberne similarity index 100% rename from tests/test_configs/med/correct/in.ellipse.gayberne rename to tests/test_configs/generator_files/med/correct/in.ellipse.gayberne diff --git a/tests/test_configs/med/correct/input-file.inp b/tests/test_configs/generator_files/med/correct/input-file.inp similarity index 100% rename from 
tests/test_configs/med/correct/input-file.inp rename to tests/test_configs/generator_files/med/correct/input-file.inp diff --git a/tests/test_configs/med/correct/input.nml b/tests/test_configs/generator_files/med/correct/input.nml similarity index 100% rename from tests/test_configs/med/correct/input.nml rename to tests/test_configs/generator_files/med/correct/input.nml diff --git a/tests/test_configs/med/correct/simple-H20.xml b/tests/test_configs/generator_files/med/correct/simple-H20.xml similarity index 100% rename from tests/test_configs/med/correct/simple-H20.xml rename to tests/test_configs/generator_files/med/correct/simple-H20.xml diff --git a/tests/test_configs/med/marked/MOM_input b/tests/test_configs/generator_files/med/marked/MOM_input similarity index 100% rename from tests/test_configs/med/marked/MOM_input rename to tests/test_configs/generator_files/med/marked/MOM_input diff --git a/tests/test_configs/med/marked/diag_table b/tests/test_configs/generator_files/med/marked/diag_table similarity index 100% rename from tests/test_configs/med/marked/diag_table rename to tests/test_configs/generator_files/med/marked/diag_table diff --git a/tests/test_configs/med/marked/example_input.i b/tests/test_configs/generator_files/med/marked/example_input.i similarity index 100% rename from tests/test_configs/med/marked/example_input.i rename to tests/test_configs/generator_files/med/marked/example_input.i diff --git a/tests/test_configs/med/marked/in.airebo b/tests/test_configs/generator_files/med/marked/in.airebo similarity index 100% rename from tests/test_configs/med/marked/in.airebo rename to tests/test_configs/generator_files/med/marked/in.airebo diff --git a/tests/test_configs/med/marked/in.atm b/tests/test_configs/generator_files/med/marked/in.atm similarity index 100% rename from tests/test_configs/med/marked/in.atm rename to tests/test_configs/generator_files/med/marked/in.atm diff --git a/tests/test_configs/med/marked/in.crack 
b/tests/test_configs/generator_files/med/marked/in.crack similarity index 100% rename from tests/test_configs/med/marked/in.crack rename to tests/test_configs/generator_files/med/marked/in.crack diff --git a/tests/test_configs/med/marked/in.ellipse.gayberne b/tests/test_configs/generator_files/med/marked/in.ellipse.gayberne similarity index 100% rename from tests/test_configs/med/marked/in.ellipse.gayberne rename to tests/test_configs/generator_files/med/marked/in.ellipse.gayberne diff --git a/tests/test_configs/med/marked/input-file.inp b/tests/test_configs/generator_files/med/marked/input-file.inp similarity index 100% rename from tests/test_configs/med/marked/input-file.inp rename to tests/test_configs/generator_files/med/marked/input-file.inp diff --git a/tests/test_configs/med/marked/input.nml b/tests/test_configs/generator_files/med/marked/input.nml similarity index 100% rename from tests/test_configs/med/marked/input.nml rename to tests/test_configs/generator_files/med/marked/input.nml diff --git a/tests/test_configs/med/marked/simple-H20.xml b/tests/test_configs/generator_files/med/marked/simple-H20.xml similarity index 100% rename from tests/test_configs/med/marked/simple-H20.xml rename to tests/test_configs/generator_files/med/marked/simple-H20.xml diff --git a/tests/test_configs/multi_tags_template.sh b/tests/test_configs/generator_files/multi_tags_template.sh similarity index 100% rename from tests/test_configs/multi_tags_template.sh rename to tests/test_configs/generator_files/multi_tags_template.sh diff --git a/tests/test_configs/new-tag/correct/MOM_input b/tests/test_configs/generator_files/new-tag/correct/MOM_input similarity index 100% rename from tests/test_configs/new-tag/correct/MOM_input rename to tests/test_configs/generator_files/new-tag/correct/MOM_input diff --git a/tests/test_configs/new-tag/correct/diag_table b/tests/test_configs/generator_files/new-tag/correct/diag_table similarity index 100% rename from 
tests/test_configs/new-tag/correct/diag_table rename to tests/test_configs/generator_files/new-tag/correct/diag_table diff --git a/tests/test_configs/new-tag/correct/example_input.i b/tests/test_configs/generator_files/new-tag/correct/example_input.i similarity index 100% rename from tests/test_configs/new-tag/correct/example_input.i rename to tests/test_configs/generator_files/new-tag/correct/example_input.i diff --git a/tests/test_configs/new-tag/correct/in.airebo b/tests/test_configs/generator_files/new-tag/correct/in.airebo similarity index 100% rename from tests/test_configs/new-tag/correct/in.airebo rename to tests/test_configs/generator_files/new-tag/correct/in.airebo diff --git a/tests/test_configs/new-tag/correct/in.atm b/tests/test_configs/generator_files/new-tag/correct/in.atm similarity index 100% rename from tests/test_configs/new-tag/correct/in.atm rename to tests/test_configs/generator_files/new-tag/correct/in.atm diff --git a/tests/test_configs/new-tag/correct/in.crack b/tests/test_configs/generator_files/new-tag/correct/in.crack similarity index 100% rename from tests/test_configs/new-tag/correct/in.crack rename to tests/test_configs/generator_files/new-tag/correct/in.crack diff --git a/tests/test_configs/new-tag/correct/in.ellipse.gayberne b/tests/test_configs/generator_files/new-tag/correct/in.ellipse.gayberne similarity index 100% rename from tests/test_configs/new-tag/correct/in.ellipse.gayberne rename to tests/test_configs/generator_files/new-tag/correct/in.ellipse.gayberne diff --git a/tests/test_configs/new-tag/correct/input-file.inp b/tests/test_configs/generator_files/new-tag/correct/input-file.inp similarity index 100% rename from tests/test_configs/new-tag/correct/input-file.inp rename to tests/test_configs/generator_files/new-tag/correct/input-file.inp diff --git a/tests/test_configs/new-tag/correct/input.nml b/tests/test_configs/generator_files/new-tag/correct/input.nml similarity index 100% rename from 
tests/test_configs/new-tag/correct/input.nml rename to tests/test_configs/generator_files/new-tag/correct/input.nml diff --git a/tests/test_configs/new-tag/correct/simple-H20.xml b/tests/test_configs/generator_files/new-tag/correct/simple-H20.xml similarity index 100% rename from tests/test_configs/new-tag/correct/simple-H20.xml rename to tests/test_configs/generator_files/new-tag/correct/simple-H20.xml diff --git a/tests/test_configs/new-tag/marked/MOM_input b/tests/test_configs/generator_files/new-tag/marked/MOM_input similarity index 100% rename from tests/test_configs/new-tag/marked/MOM_input rename to tests/test_configs/generator_files/new-tag/marked/MOM_input diff --git a/tests/test_configs/new-tag/marked/diag_table b/tests/test_configs/generator_files/new-tag/marked/diag_table similarity index 100% rename from tests/test_configs/new-tag/marked/diag_table rename to tests/test_configs/generator_files/new-tag/marked/diag_table diff --git a/tests/test_configs/new-tag/marked/example_input.i b/tests/test_configs/generator_files/new-tag/marked/example_input.i similarity index 100% rename from tests/test_configs/new-tag/marked/example_input.i rename to tests/test_configs/generator_files/new-tag/marked/example_input.i diff --git a/tests/test_configs/new-tag/marked/in.airebo b/tests/test_configs/generator_files/new-tag/marked/in.airebo similarity index 100% rename from tests/test_configs/new-tag/marked/in.airebo rename to tests/test_configs/generator_files/new-tag/marked/in.airebo diff --git a/tests/test_configs/new-tag/marked/in.atm b/tests/test_configs/generator_files/new-tag/marked/in.atm similarity index 100% rename from tests/test_configs/new-tag/marked/in.atm rename to tests/test_configs/generator_files/new-tag/marked/in.atm diff --git a/tests/test_configs/new-tag/marked/in.crack b/tests/test_configs/generator_files/new-tag/marked/in.crack similarity index 100% rename from tests/test_configs/new-tag/marked/in.crack rename to 
tests/test_configs/generator_files/new-tag/marked/in.crack diff --git a/tests/test_configs/new-tag/marked/in.ellipse.gayberne b/tests/test_configs/generator_files/new-tag/marked/in.ellipse.gayberne similarity index 100% rename from tests/test_configs/new-tag/marked/in.ellipse.gayberne rename to tests/test_configs/generator_files/new-tag/marked/in.ellipse.gayberne diff --git a/tests/test_configs/new-tag/marked/input-file.inp b/tests/test_configs/generator_files/new-tag/marked/input-file.inp similarity index 100% rename from tests/test_configs/new-tag/marked/input-file.inp rename to tests/test_configs/generator_files/new-tag/marked/input-file.inp diff --git a/tests/test_configs/new-tag/marked/input.nml b/tests/test_configs/generator_files/new-tag/marked/input.nml similarity index 100% rename from tests/test_configs/new-tag/marked/input.nml rename to tests/test_configs/generator_files/new-tag/marked/input.nml diff --git a/tests/test_configs/new-tag/marked/simple-H20.xml b/tests/test_configs/generator_files/new-tag/marked/simple-H20.xml similarity index 100% rename from tests/test_configs/new-tag/marked/simple-H20.xml rename to tests/test_configs/generator_files/new-tag/marked/simple-H20.xml diff --git a/tests/test_configs/tag_dir_template/nested_0/tagged_0.sh b/tests/test_configs/generator_files/tag_dir_template/nested_0/tagged_0.sh similarity index 100% rename from tests/test_configs/tag_dir_template/nested_0/tagged_0.sh rename to tests/test_configs/generator_files/tag_dir_template/nested_0/tagged_0.sh diff --git a/tests/test_configs/tag_dir_template/nested_1/tagged_1.sh b/tests/test_configs/generator_files/tag_dir_template/nested_1/tagged_1.sh similarity index 100% rename from tests/test_configs/tag_dir_template/nested_1/tagged_1.sh rename to tests/test_configs/generator_files/tag_dir_template/nested_1/tagged_1.sh diff --git a/tests/test_configs/generator_files/test_dir/test.in b/tests/test_configs/generator_files/test_dir/test.in new file mode 100644 index 
000000000..8a0a76ee2 --- /dev/null +++ b/tests/test_configs/generator_files/test_dir/test.in @@ -0,0 +1,28 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +thermo = @THERMO@ +steps = @STEPS@ \ No newline at end of file diff --git a/tests/test_configs/test_dir/test_dir_1/config.txt b/tests/test_configs/generator_files/test_dir/test_dir_1/config.txt similarity index 100% rename from tests/test_configs/test_dir/test_dir_1/config.txt rename to tests/test_configs/generator_files/test_dir/test_dir_1/config.txt diff --git a/tests/test_configs/to_copy_dir/mock.txt b/tests/test_configs/generator_files/to_copy_dir/mock.txt similarity index 100% rename from tests/test_configs/to_copy_dir/mock.txt rename to tests/test_configs/generator_files/to_copy_dir/mock.txt diff --git a/tests/test_configs/to_symlink_dir/mock2.txt b/tests/test_configs/generator_files/to_symlink_dir/mock2.txt similarity index 100% rename from tests/test_configs/to_symlink_dir/mock2.txt rename to tests/test_configs/generator_files/to_symlink_dir/mock2.txt diff --git a/tests/test_configs/ml/training_service_torch.py b/tests/test_configs/ml/training_service_torch.py index 8c1091820..575940031 100644 --- a/tests/test_configs/ml/training_service_torch.py +++ b/tests/test_configs/ml/training_service_torch.py @@ -68,7 +68,6 @@ def forward(self, x): print("Started training") for epoch in range(1): # loop over the dataset multiple times - running_loss = 0.0 epoch_running_loss = 0.0 output_period = 1 diff --git a/tests/test_configs/printing_model.py b/tests/test_configs/printing_model.py new file mode 100644 index 000000000..a43f12a67 --- /dev/null +++ b/tests/test_configs/printing_model.py @@ -0,0 +1,18 @@ +import sys +import time + + +def main() -> int: + print(";START;") + time.sleep(20) + print(";MID;") + print("This is an error msg", file=sys.stderr) + time.sleep(20) + print(";END;") + + print("yay!!") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/test_configs/reconnect_node.py b/tests/test_configs/reconnect_node.py index dd07d2d47..3ff3d71ef 100644 --- a/tests/test_configs/reconnect_node.py +++ 
b/tests/test_configs/reconnect_node.py @@ -29,7 +29,6 @@ from smartsim import Client if __name__ == "__main__": - import argparse argparser = argparse.ArgumentParser() diff --git a/tests/test_configs/reconnect_sim.py b/tests/test_configs/reconnect_sim.py index a6340adf5..eda634517 100644 --- a/tests/test_configs/reconnect_sim.py +++ b/tests/test_configs/reconnect_sim.py @@ -31,14 +31,12 @@ def create_data(seed, size): - np.random.seed(seed) x = np.random.uniform(-15.0, 15.0, size=size) return x if __name__ == "__main__": - import argparse argparser = argparse.ArgumentParser() diff --git a/tests/test_configs/send_data_local_smartredis.py b/tests/test_configs/send_data_local_smartredis.py index 48acf0915..0c318736f 100644 --- a/tests/test_configs/send_data_local_smartredis.py +++ b/tests/test_configs/send_data_local_smartredis.py @@ -39,7 +39,7 @@ def main(): returned = client.get_tensor("test_array") np.testing.assert_array_equal(array, returned) - print(f"Test worked! Sent and received array: {str(array)}") + print(f"Test worked! 
Sent {str(array)} and received {str(returned)}") if __name__ == "__main__": diff --git a/tests/test_configs/sleep.py b/tests/test_configs/sleep.py index 7875e9a3a..778b8946e 100644 --- a/tests/test_configs/sleep.py +++ b/tests/test_configs/sleep.py @@ -36,7 +36,6 @@ def sleep(time_to_sleep): if __name__ == "__main__": - parser = argparse.ArgumentParser() parser.add_argument("--time", type=int, default=15) args = parser.parse_args() diff --git a/tests/test_configs/smartredis/consumer.py b/tests/test_configs/smartredis/consumer.py index 8c5c1528b..7833147c6 100644 --- a/tests/test_configs/smartredis/consumer.py +++ b/tests/test_configs/smartredis/consumer.py @@ -40,7 +40,6 @@ parser.add_argument("--exchange", action="store_true") args = parser.parse_args() - # get model and set into database c = Client(False) keyin = os.getenv("SSKEYIN") diff --git a/tests/test_configs/smartredis/dbid.py b/tests/test_configs/smartredis/dbid.py new file mode 100644 index 000000000..a0aa0993a --- /dev/null +++ b/tests/test_configs/smartredis/dbid.py @@ -0,0 +1,49 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import os + +from smartredis import Client, ConfigOptions + +if __name__ == "__main__": + """For inclusion in test with single database identifier in a single Client + constructor""" + + parser = argparse.ArgumentParser(description="SmartRedis") + parser.add_argument("--exchange", action="store_true") + args = parser.parse_args() + + env_vars = [ + "SSDB_testdb_colo", + "SR_DB_TYPE_testdb_colo", + ] + + assert all([var in os.environ for var in env_vars]) + + opts1 = ConfigOptions.create_from_environment("testdb_colo") + + client = Client(opts1, logger_name="SmartSim") diff --git a/tests/test_configs/smartredis/multidbid.py b/tests/test_configs/smartredis/multidbid.py new file mode 100644 index 000000000..24c8768ec --- /dev/null +++ b/tests/test_configs/smartredis/multidbid.py @@ -0,0 +1,52 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import os + +from smartredis import Client, ConfigOptions + +if __name__ == "__main__": + """For inclusion in test with two unique database identifiers with multiple databases""" + + parser = argparse.ArgumentParser(description="SmartRedis") + parser.add_argument("--exchange", action="store_true") + args = parser.parse_args() + + env_vars = [ + "SSDB_testdb_reg", + "SR_DB_TYPE_testdb_reg", + "SSDB_testdb_colo", + "SR_DB_TYPE_testdb_colo", + ] + + assert all([var in os.environ for var in env_vars]) + + opts1 = ConfigOptions.create_from_environment("testdb_reg") + opts2 = ConfigOptions.create_from_environment("testdb_colo") + + c1 = Client(opts1, logger_name="SmartSim") + c2 = Client(opts2, logger_name="SmartSim") diff --git a/tests/test_configs/telemetry/colocatedmodel.json b/tests/test_configs/telemetry/colocatedmodel.json new file mode 100644 index 000000000..f3e93ac76 --- /dev/null +++ b/tests/test_configs/telemetry/colocatedmodel.json @@ -0,0 +1,69 @@ +{ + "schema info": { + "schema_name": "entity manifest", + "version": "0.0.1" + }, + "experiment": { + "name": "my-exp", + "path": "/tmp/my-exp", + "launcher": "Slurm" + }, + "runs": [ + { + "run_id": "002816b", + 
"timestamp": 1699037041106269774, + "model": [ + { + "name": "colocated_model", + "path": "/tmp/my-exp/colocated_model", + "exe_args": [ + "/path/to/my/script.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": {} + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": { + "settings": { + "unix_socket": "/tmp/redis.socket", + "socket_permissions": 755, + "port": 0, + "cpus": 1, + "custom_pinning": "0", + "debug": false, + "db_identifier": "", + "rai_args": { + "threads_per_queue": null, + "inter_op_parallelism": null, + "intra_op_parallelism": null + }, + "extra_db_args": {} + }, + "scripts": [], + "models": [] + }, + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_ensemble/002816b/model/colocated_model", + "step_id": "4139111.21", + "task_id": "21529", + "managed": true + }, + "out_file": "/tmp/my-exp/colocated_model/colocated_model.out", + "err_file": "/tmp/my-exp/colocated_model/colocated_model.err" + } + ], + "orchestrator": [], + "ensemble": [] + } + ] +} diff --git a/tests/test_configs/telemetry/db_and_model.json b/tests/test_configs/telemetry/db_and_model.json new file mode 100644 index 000000000..58c1c841a --- /dev/null +++ b/tests/test_configs/telemetry/db_and_model.json @@ -0,0 +1,86 @@ +{ + "schema info": { + "schema_name": "entity manifest", + "version": "0.0.1" + }, + "experiment": { + "name": "my-exp", + "path": "/tmp/my-exp", + "launcher": "Slurm" + }, + "runs": [ + { + "run_id": "2ca19ad", + "timestamp": 1699038647234488933, + "model": [], + "orchestrator": [ + { + "name": "orchestrator", + "type": "redis", + "interface": [ + "ipogif0" + ], + "shards": [ + { + "name": "orchestrator_0", + "hostname": "10.128.0.4", + "port": 6780, + "cluster": false, + "conf_file": null, + "out_file": "/path/to/some/file.out", + "err_file": "/path/to/some/file.err", + 
"telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/2ca19ad/database/orchestrator/orchestrator_0", + "step_id": "4139111.27", + "task_id": "1452", + "managed": true + } + } + ] + } + ], + "ensemble": [] + }, + { + "run_id": "4b5507a", + "timestamp": 1699038661491043211, + "model": [ + { + "name": "perroquet", + "path": "/tmp/my-exp/perroquet", + "exe_args": [ + "/path/to/my/script.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks-per-node": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/4b5507a/model/perroquet", + "step_id": "4139111.28", + "task_id": "2929", + "managed": true + }, + "out_file": "/tmp/my-exp/perroquet/perroquet.out", + "err_file": "/tmp/my-exp/perroquet/perroquet.err" + } + ], + "orchestrator": [], + "ensemble": [] + } + ] +} diff --git a/tests/test_configs/telemetry/db_and_model_1run.json b/tests/test_configs/telemetry/db_and_model_1run.json new file mode 100644 index 000000000..44e32bfe4 --- /dev/null +++ b/tests/test_configs/telemetry/db_and_model_1run.json @@ -0,0 +1,79 @@ +{ + "schema info": { + "schema_name": "entity manifest", + "version": "0.0.1" + }, + "experiment": { + "name": "my-exp", + "path": "/tmp/my-exp", + "launcher": "Slurm" + }, + "runs": [ + { + "run_id": "4b5507a", + "timestamp": 1699038661491043211, + "model": [ + { + "name": "perroquet", + "path": "/tmp/my-exp/perroquet", + "exe_args": [ + "/path/to/my/script.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks-per-node": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + 
"colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/4b5507a/model/perroquet", + "step_id": "4139111.28", + "task_id": "2929", + "managed": true + }, + "out_file": "/tmp/my-exp/perroquet/perroquet.out", + "err_file": "/tmp/my-exp/perroquet/perroquet.err" + } + ], + "orchestrator": [ + { + "name": "orchestrator", + "type": "redis", + "interface": [ + "ipogif0" + ], + "shards": [ + { + "name": "orchestrator_0", + "hostname": "10.128.0.4", + "port": 6780, + "cluster": false, + "conf_file": null, + "out_file": "/path/to/some/file.out", + "err_file": "/path/to/some/file.err", + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/2ca19ad/database/orchestrator/orchestrator_0", + "step_id": "4139111.27", + "task_id": "1452", + "managed": true + } + } + ] + } + ], + "ensemble": [] + } + ] +} diff --git a/tests/test_configs/telemetry/ensembles.json b/tests/test_configs/telemetry/ensembles.json new file mode 100644 index 000000000..841324ec6 --- /dev/null +++ b/tests/test_configs/telemetry/ensembles.json @@ -0,0 +1,329 @@ +{ + "schema info": { + "schema_name": "entity manifest", + "version": "0.0.1" + }, + "experiment": { + "name": "my-exp", + "path": "/home/someuser/code/ss/my-exp", + "launcher": "Local" + }, + "runs": [ + { + "run_id": "d041b90", + "timestamp": 1698679830384608928, + "model": [], + "orchestrator": [], + "ensemble": [ + { + "name": "my-ens", + "params": { + "START": [ + "spam", + "foo" + ], + "MID": [ + "eggs", + "bar" + ], + "END": [ + "ham", + "baz" + ] + }, + "batch_settings": {}, + "models": [ + { + "name": "my-ens_0", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "ham" + }, + "files": { + "Symlink": [], + 
"Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_0", + "step_id": null, + "task_id": "88118", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_0.out", + "err_file": "/home/someuser/code/ss/my-ens_0.err" + }, + { + "name": "my-ens_1", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_1", + "step_id": null, + "task_id": "88131", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_1.out", + "err_file": "/home/someuser/code/ss/my-ens_1.err" + }, + { + "name": "my-ens_2", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "bar", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_2", + "step_id": null, + "task_id": "88146", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_2.out", + "err_file": "/home/someuser/code/ss/my-ens_2.err" + }, + { + "name": "my-ens_3", + 
"path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "bar", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_3", + "step_id": null, + "task_id": "88170", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_3.out", + "err_file": "/home/someuser/code/ss/my-ens_3.err" + }, + { + "name": "my-ens_4", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_4", + "step_id": null, + "task_id": "88178", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_4.out", + "err_file": "/home/someuser/code/ss/my-ens_4.err" + }, + { + "name": "my-ens_5", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + 
"telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_5", + "step_id": null, + "task_id": "88193", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_5.out", + "err_file": "/home/someuser/code/ss/my-ens_5.err" + }, + { + "name": "my-ens_6", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "bar", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_6", + "step_id": null, + "task_id": "88221", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_6.out", + "err_file": "/home/someuser/code/ss/my-ens_6.err" + }, + { + "name": "my-ens_7", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "bar", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_7", + "step_id": null, + "task_id": "88241", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_7.out", + "err_file": "/home/someuser/code/ss/my-ens_7.err" + } + ] + } + ] + } + ] + } diff --git a/tests/test_configs/telemetry/serialmodels.json b/tests/test_configs/telemetry/serialmodels.json new 
file mode 100644 index 000000000..40337eceb --- /dev/null +++ b/tests/test_configs/telemetry/serialmodels.json @@ -0,0 +1,186 @@ +{ + "schema info": { + "schema_name": "entity manifest", + "version": "0.0.1" + }, + "experiment": { + "name": "my-exp", + "path": "/tmp/my-exp", + "launcher": "Slurm" + }, + "runs": [ + { + "run_id": "8c0fbb1", + "timestamp": 1699037881502730708, + "model": [ + { + "name": "perroquet_0", + "path": "/tmp/my-exp/perroquet_0", + "exe_args": [ + "/tmp/echo.py" + ], + "run_settings": { + "exe": [ + "/path/to/some/python" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks-per-node": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_serial_models/8c0fbb1/model/perroquet_0", + "step_id": "4139111.22", + "task_id": "17966", + "managed": true + }, + "out_file": "/tmp/my-exp/perroquet_0/perroquet_0.out", + "err_file": "/tmp/my-exp/perroquet_0/perroquet_0.err" + }, + { + "name": "perroquet_1", + "path": "/tmp/my-exp/perroquet_1", + "exe_args": [ + "/tmp/echo.py" + ], + "run_settings": { + "exe": [ + "/path/to/some/python" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks-per-node": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_serial_models/8c0fbb1/model/perroquet_1", + "step_id": "4139111.23", + "task_id": "18100", + "managed": true + }, + "out_file": "/tmp/my-exp/perroquet_1/perroquet_1.out", + "err_file": "/tmp/my-exp/perroquet_1/perroquet_1.err" + }, + { + "name": "perroquet_2", + "path": "/tmp/my-exp/perroquet_2", + "exe_args": [ + "/tmp/echo.py" + ], + "run_settings": { + "exe": [ + "/path/to/some/python" + ], + 
"run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks-per-node": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_serial_models/8c0fbb1/model/perroquet_2", + "step_id": "4139111.24", + "task_id": "18159", + "managed": true + }, + "out_file": "/tmp/my-exp/perroquet_2/perroquet_2.out", + "err_file": "/tmp/my-exp/perroquet_2/perroquet_2.err" + }, + { + "name": "perroquet_3", + "path": "/tmp/my-exp/perroquet_3", + "exe_args": [ + "/tmp/echo.py" + ], + "run_settings": { + "exe": [ + "/path/to/some/python" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks-per-node": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_serial_models/8c0fbb1/model/perroquet_3", + "step_id": "4139111.25", + "task_id": "18499", + "managed": true + }, + "out_file": "/tmp/my-exp/perroquet_3/perroquet_3.out", + "err_file": "/tmp/my-exp/perroquet_3/perroquet_3.err" + }, + { + "name": "perroquet_4", + "path": "/tmp/my-exp/perroquet_4", + "exe_args": [ + "/tmp/echo.py" + ], + "run_settings": { + "exe": [ + "/path/to/some/python" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks-per-node": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_serial_models/8c0fbb1/model/perroquet_4", + "step_id": "4139111.26", + "task_id": "18832", + "managed": true + }, + "out_file": "/tmp/my-exp/perroquet_4/perroquet_4.out", + "err_file": "/tmp/my-exp/perroquet_4/perroquet_4.err" + } + ], 
+ "orchestrator": [], + "ensemble": [] + } + ] +} diff --git a/tests/test_configs/telemetry/telemetry.json b/tests/test_configs/telemetry/telemetry.json new file mode 100644 index 000000000..a380bc5fb --- /dev/null +++ b/tests/test_configs/telemetry/telemetry.json @@ -0,0 +1,946 @@ +{ + "experiment": { + "name": "my-exp", + "path": "/path/to/my-exp", + "launcher": "Slurm" + }, + "runs": [ + { + "run_id": "d999ad89-020f-4e6a-b834-dbd88658ce84", + "timestamp": 1697824072792854287, + "model": [ + { + "name": "my-model", + "path": "/path/to/my-exp/my-model", + "exe_args": [ + "hello", + "world" + ], + "run_settings": { + "exe": [ + "/usr/bin/echo" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": { + "settings": { + "port": 5757, + "ifname": "lo", + "cpus": 1, + "custom_pinning": "0", + "debug": false, + "db_identifier": "COLO", + "rai_args": { + "threads_per_queue": null, + "inter_op_parallelism": null, + "intra_op_parallelism": null + }, + "extra_db_args": {} + }, + "scripts": [], + "models": [ + { + "cnn": { + "backend": "TORCH", + "device": "CPU" + } + } + ] + }, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d999ad89-020f-4e6a-b834-dbd88658ce84/model/my-model", + "step_id": "4121050.30", + "task_id": "25230", + "managed": true + }, + "out_file": "/path/to/my-exp/my-model/my-model.out", + "err_file": "/path/to/my-exp/my-model/my-model.err" + } + ], + "orchestrator": [], + "ensemble": [] + }, + { + "run_id": "fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa", + "timestamp": 1697824102122439975, + "model": [], + "orchestrator": [ + { + "name": "orchestrator", + "type": "redis", + "interface": [ + "ipogif0" + ], + "shards": [ + { + "name": "orchestrator_1", + "hostname": "10.128.0.70", + "port": 2424, + "cluster": true, + "conf_file": 
"nodes-orchestrator_1-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", + "step_id": "4121050.31+2", + "task_id": "25241", + "managed": true + } + }, + { + "name": "orchestrator_2", + "hostname": "10.128.0.71", + "port": 2424, + "cluster": true, + "conf_file": "nodes-orchestrator_2-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", + "step_id": "4121050.31+2", + "task_id": "25241", + "managed": true + } + }, + { + "name": "orchestrator_0", + "hostname": "10.128.0.69", + "port": 2424, + "cluster": true, + "conf_file": "nodes-orchestrator_0-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", + "step_id": "4121050.31+2", + "task_id": "25241", + "managed": true + } + } + ] + } + ], + "ensemble": [] + }, + { + "run_id": "d65ae1df-cb5e-45d9-ab09-6fa641755997", + "timestamp": 1697824127962219505, + "model": [], + "orchestrator": [], + "ensemble": [ + { + "name": "my-ens", + "params": { + "START": [ + "spam", + "foo" + ], + "MID": [ + "eggs", + "bar" + ], + "END": [ + "ham", + "baz" + ] + }, + "batch_settings": {}, + "models": [ + { + "name": "my-ens_0", + "path": "/path/to/my-exp/my-ens/my-ens_0", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + 
"run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_0", + "step_id": "4121050.32", + "task_id": "25639", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_0/my-ens_0.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_0/my-ens_0.err" + }, + { + "name": "my-ens_1", + "path": "/path/to/my-exp/my-ens/my-ens_1", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_1", + "step_id": "4121050.33", + "task_id": "25768", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.err" + }, + { + "name": "my-ens_2", + "path": "/path/to/my-exp/my-ens/my-ens_2", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "bar", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": 
"/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_2", + "step_id": "4121050.34", + "task_id": "25817", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_2/my-ens_2.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_2/my-ens_2.err" + }, + { + "name": "my-ens_3", + "path": "/path/to/my-exp/my-ens/my-ens_3", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "bar", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_3", + "step_id": "4121050.35", + "task_id": "25837", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.err" + }, + { + "name": "my-ens_4", + "path": "/path/to/my-exp/my-ens/my-ens_4", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_4", + "step_id": "4121050.36", + "task_id": "25872", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.err" + }, + { + "name": "my-ens_5", + "path": 
"/path/to/my-exp/my-ens/my-ens_5", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_5", + "step_id": "4121050.37", + "task_id": "25930", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.err" + }, + { + "name": "my-ens_6", + "path": "/path/to/my-exp/my-ens/my-ens_6", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "bar", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_6", + "step_id": "4121050.38", + "task_id": "25945", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_6/my-ens_6.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_6/my-ens_6.err" + }, + { + "name": "my-ens_7", + "path": "/path/to/my-exp/my-ens/my-ens_7", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "bar", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ 
+ "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_7", + "step_id": "4121050.39", + "task_id": "25967", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.err" + } + ] + } + ] + }, + { + "run_id": "e41f8e17-c4b2-441d-adf9-707443ee2c72", + "timestamp": 1697835227560376025, + "model": [ + { + "name": "my-model", + "path": "/path/to/my-exp/my-model", + "exe_args": [ + "hello", + "world" + ], + "run_settings": { + "exe": [ + "/usr/bin/echo" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": { + "settings": { + "port": 5757, + "ifname": "lo", + "cpus": 1, + "custom_pinning": "0", + "debug": false, + "db_identifier": "COLO", + "rai_args": { + "threads_per_queue": null, + "inter_op_parallelism": null, + "intra_op_parallelism": null + }, + "extra_db_args": {} + }, + "scripts": [], + "models": [ + { + "cnn": { + "backend": "TORCH", + "device": "CPU" + } + } + ] + }, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/e41f8e17-c4b2-441d-adf9-707443ee2c72/model/my-model", + "step_id": "4121904.0", + "task_id": "28277", + "managed": true + }, + "out_file": "/path/to/my-exp/my-model/my-model.out", + "err_file": "/path/to/my-exp/my-model/my-model.err" + } + ], + "orchestrator": [], + "ensemble": [] + }, + { + "run_id": "b33a5d27-6822-4795-8e0e-cfea18551fa4", + "timestamp": 1697835261956135240, + "model": [], + "orchestrator": [ + { + "name": "orchestrator", + "type": "redis", + "interface": [ + "ipogif0" + ], + "shards": [ + { + "name": "orchestrator_0", + "hostname": "10.128.0.2", + "port": 2424, + "cluster": true, + 
"conf_file": "nodes-orchestrator_0-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", + "step_id": "4121904.1+2", + "task_id": "28289", + "managed": true + } + }, + { + "name": "orchestrator_2", + "hostname": "10.128.0.4", + "port": 2424, + "cluster": true, + "conf_file": "nodes-orchestrator_2-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", + "step_id": "4121904.1+2", + "task_id": "28289", + "managed": true + } + }, + { + "name": "orchestrator_1", + "hostname": "10.128.0.3", + "port": 2424, + "cluster": true, + "conf_file": "nodes-orchestrator_1-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", + "step_id": "4121904.1+2", + "task_id": "28289", + "managed": true + } + } + ] + } + ], + "ensemble": [] + }, + { + "run_id": "45772df2-fd80-43fd-adf0-d5e319870182", + "timestamp": 1697835287798613875, + "model": [], + "orchestrator": [], + "ensemble": [ + { + "name": "my-ens", + "params": { + "START": [ + "spam", + "foo" + ], + "MID": [ + "eggs", + "bar" + ], + "END": [ + "ham", + "baz" + ] + }, + "batch_settings": {}, + "models": [ + { + "name": "my-ens_0", + "path": "/path/to/my-exp/my-ens/my-ens_0", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + 
"run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_0", + "step_id": "4121904.2", + "task_id": "28333", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_0/my-ens_0.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_0/my-ens_0.err" + }, + { + "name": "my-ens_1", + "path": "/path/to/my-exp/my-ens/my-ens_1", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_1", + "step_id": "4121904.3", + "task_id": "28342", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.err" + }, + { + "name": "my-ens_2", + "path": "/path/to/my-exp/my-ens/my-ens_2", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "bar", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": 
"/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_2", + "step_id": "4121904.4", + "task_id": "28353", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_2/my-ens_2.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_2/my-ens_2.err" + }, + { + "name": "my-ens_3", + "path": "/path/to/my-exp/my-ens/my-ens_3", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "bar", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_3", + "step_id": "4121904.5", + "task_id": "28362", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.err" + }, + { + "name": "my-ens_4", + "path": "/path/to/my-exp/my-ens/my-ens_4", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_4", + "step_id": "4121904.6", + "task_id": "28371", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.err" + }, + { + "name": "my-ens_5", + "path": 
"/path/to/my-exp/my-ens/my-ens_5", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_5", + "step_id": "4121904.7", + "task_id": "28380", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.err" + }, + { + "name": "my-ens_6", + "path": "/path/to/my-exp/my-ens/my-ens_6", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "bar", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_6", + "step_id": "4121904.8", + "task_id": "28389", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_6/my-ens_6.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_6/my-ens_6.err" + }, + { + "name": "my-ens_7", + "path": "/path/to/my-exp/my-ens/my-ens_7", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "bar", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + 
"/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_7", + "step_id": "4121904.9", + "task_id": "28398", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.err" + } + ] + } + ] + } + ] + } + diff --git a/tests/test_containers.py b/tests/test_containers.py index 0c6db8d49..e35b4f309 100644 --- a/tests/test_containers.py +++ b/tests/test_containers.py @@ -32,11 +32,14 @@ import pytest from smartsim import Experiment, status -from smartsim._core.utils import installed_redisai_backends from smartsim.database import Orchestrator -from smartsim.entity import Ensemble, Model +from smartsim.entity import Ensemble from smartsim.settings.containers import Singularity +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + # Check if singularity is available as command line tool singularity_exists = which("singularity") is not None containerURI = "docker://alrigazzi/smartsim-testing:latest" @@ -87,9 +90,8 @@ def test_singularity_commands(fileutils): @pytest.mark.skipif(not singularity_exists, reason="Test needs singularity to run") -def test_singularity_basic(fileutils): +def test_singularity_basic(fileutils, test_dir): """Basic argument-less Singularity test""" - test_dir = fileutils.make_test_dir() container = Singularity(containerURI) @@ -113,9 +115,9 @@ def test_singularity_basic(fileutils): @pytest.mark.skipif(not singularity_exists, reason="Test needs singularity to run") -def test_singularity_args(fileutils): +def test_singularity_args(fileutils, test_dir): """Test combinations of args and mount arguments for Singularity""" - test_dir = fileutils.make_test_dir() + hometest_dir = os.path.join(str(Path.home()), "test") # $HOME/test mount_paths = {test_dir + "/singularity_args": hometest_dir} 
container = Singularity(containerURI, args="--contain", mount=mount_paths) @@ -140,7 +142,7 @@ def test_singularity_args(fileutils): @pytest.mark.skipif(not singularity_exists, reason="Test needs singularity to run") -def test_singularity_smartredis(fileutils, wlmutils): +def test_singularity_smartredis(test_dir, fileutils, wlmutils): """Run two processes, each process puts a tensor on the DB, then accesses the other process's tensor. Finally, the tensor is used to run a model. @@ -148,7 +150,6 @@ def test_singularity_smartredis(fileutils, wlmutils): Note: This is a containerized port of test_smartredis.py """ - test_dir = fileutils.make_test_dir() exp = Experiment( "smartredis_ensemble_exchange", exp_path=test_dir, launcher="local" ) diff --git a/tests/test_controller.py b/tests/test_controller.py new file mode 100644 index 000000000..65687ec59 --- /dev/null +++ b/tests/test_controller.py @@ -0,0 +1,75 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pathlib + +import pytest + +from smartsim._core.control.controller import Controller +from smartsim._core.launcher.step import Step +from smartsim.database.orchestrator import Orchestrator +from smartsim.entity.ensemble import Ensemble +from smartsim.settings.slurmSettings import SbatchSettings, SrunSettings + +controller = Controller() + +rs = SrunSettings("echo", ["spam", "eggs"]) +bs = SbatchSettings() + +ens = Ensemble("ens", params={}, run_settings=rs, batch_settings=bs, replicas=3) +orc = Orchestrator(db_nodes=3, batch=True, launcher="slurm", run_command="srun") + + +class MockStep(Step): + @staticmethod + def _create_unique_name(name): + return name + + def add_to_batch(self, step): ... 
+ + def get_launch_cmd(self): + return [] + + +@pytest.mark.parametrize( + "collection", + [ + pytest.param(ens, id="Ensemble"), + pytest.param(orc, id="Database"), + ], +) +def test_controller_batch_step_creation_preserves_entity_order(collection, monkeypatch): + monkeypatch.setattr( + controller._launcher, + "create_step", + lambda name, path, settings: MockStep(name, path, settings), + ) + entity_names = [x.name for x in collection.entities] + assert len(entity_names) == len(set(entity_names)) + _, steps = controller._create_batch_job_step( + collection, pathlib.Path("mock/exp/path") + ) + assert entity_names == [step.name for step in steps] diff --git a/tests/test_controller_errors.py b/tests/test_controller_errors.py index e9e6cb0f5..a40ccdf66 100644 --- a/tests/test_controller_errors.py +++ b/tests/test_controller_errors.py @@ -34,6 +34,9 @@ from smartsim.error.errors import SSUnsupportedError from smartsim.settings import RunSettings +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + def test_finished_entity_orc_error(): """Orchestrators are never 'finished', either run forever or stopped by user""" @@ -97,7 +100,7 @@ def test_wrong_orchestrator(wlmutils): cont = Controller(launcher="local") manifest = Manifest(orc) with pytest.raises(SmartSimError): - cont._launch(manifest) + cont._launch("exp_name", "exp_path", manifest) def test_bad_orc_checkpoint(): diff --git a/tests/test_dbnode.py b/tests/test_dbnode.py index a4f4f641e..ec0ed23ea 100644 --- a/tests/test_dbnode.py +++ b/tests/test_dbnode.py @@ -24,13 +24,22 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import io +import json +import random +import string +import textwrap import pytest from smartsim import Experiment from smartsim.database import Orchestrator +from smartsim.entity.dbnode import DBNode, LaunchedShardData from smartsim.error.errors import SmartSimError +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + def test_parse_db_host_error(): orc = Orchestrator() @@ -40,30 +49,88 @@ def test_parse_db_host_error(): orc.entities[0].host -def test_hosts(fileutils, wlmutils): +def test_hosts(test_dir, wlmutils): exp_name = "test_hosts" - exp = Experiment(exp_name) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, exp_path=test_dir) orc = Orchestrator(port=wlmutils.get_test_port(), interface="lo", launcher="local") orc.set_path(test_dir) exp.start(orc) - thrown = False hosts = [] try: hosts = orc.hosts - except SmartSimError: - thrown = True + assert len(hosts) == orc.db_nodes == 1 finally: # stop the database even if there is an error raised exp.stop(orc) orc.remove_stale_files() - assert not thrown - assert hosts == orc.hosts + + +def _random_shard_info(): + rand_string = lambda: "".join(random.choices(string.ascii_letters, k=10)) + rand_num = lambda: random.randint(1000, 9999) + flip_coin = lambda: random.choice((True, False)) + + return LaunchedShardData( + name=rand_string(), + hostname=rand_string(), + port=rand_num(), + cluster=flip_coin(), + ) + + +def test_launched_shard_info_can_be_serialized(): + shard_data = _random_shard_info() + shard_data_from_str = LaunchedShardData( + **json.loads(json.dumps(shard_data.to_dict())) + ) + + assert shard_data is not shard_data_from_str + assert shard_data == shard_data_from_str + + +@pytest.mark.parametrize("limit", [None, 1]) +def test_db_node_can_parse_launched_shard_info(limit): + rand_shards = [_random_shard_info() for _ in range(3)] + with io.StringIO(textwrap.dedent("""\ + This is some file like str + -------------------------- + + 
SMARTSIM_ORC_SHARD_INFO: {} + ^^^^^^^^^^^^^^^^^^^^^^^ + We should be able to parse the serialized + launched db info from this file if the line is + prefixed with this tag. + + Here are two more for good measure: + SMARTSIM_ORC_SHARD_INFO: {} + SMARTSIM_ORC_SHARD_INFO: {} + + All other lines should be ignored. + """).format(*(json.dumps(s.to_dict()) for s in rand_shards))) as stream: + parsed_shards = DBNode._parse_launched_shard_info_from_iterable(stream, limit) + if limit is not None: + rand_shards = rand_shards[:limit] + assert rand_shards == parsed_shards def test_set_host(): orc = Orchestrator() - orc.entities[0].set_host("host") - assert orc.entities[0]._host == "host" + orc.entities[0].set_hosts(["host"]) + assert orc.entities[0].host == "host" + + +@pytest.mark.parametrize("nodes, mpmd", [[3, False], [3, True], [1, False]]) +def test_db_id_and_name(mpmd, nodes, wlmutils): + if nodes > 1 and wlmutils.get_test_launcher() not in pytest.wlm_options: + pytest.skip(reason="Clustered DB can only be checked on WLMs") + orc = Orchestrator( + db_identifier="test_db", + db_nodes=nodes, + single_cmd=mpmd, + launcher=wlmutils.get_test_launcher(), + ) + for i, node in enumerate(orc.entities): + assert node.name == f"{orc.name}_{i}" + assert node.db_identifier == orc.db_identifier diff --git a/tests/test_ensemble.py b/tests/test_ensemble.py index 2326eb826..4545e80bf 100644 --- a/tests/test_ensemble.py +++ b/tests/test_ensemble.py @@ -34,6 +34,10 @@ from smartsim.error import EntityExistsError, SSUnsupportedError, UserStrategyError from smartsim.settings import RunSettings +# The tests in this file belong to the slow_tests group +pytestmark = pytest.mark.slow_tests + + """ Test ensemble creation @@ -45,7 +49,7 @@ # ---- helpers ------------------------------------------------------ -def step_values(param_names, param_values, n_models = 0): +def step_values(param_names, param_values, n_models=0): permutations = [] for p in zip(*param_values): 
permutations.append(dict(zip(param_names, p))) @@ -54,13 +58,13 @@ def step_values(param_names, param_values, n_models = 0): # bad permutation strategy that doesn't return # a list of dictionaries -def bad_strategy(names, values, n_models = 0): +def bad_strategy(names, values, n_models=0): return -1 # test bad perm strategy that returns a list but of lists # not dictionaries -def bad_strategy_2(names, values, n_models = 0): +def bad_strategy_2(names, values, n_models=0): return [values] @@ -226,6 +230,7 @@ def test_arg_and_model_params_all_perms(): # ----- Error Handling -------------------------------------- + # unknown permuation strategy def test_unknown_perm_strat(): bad_strat = "not-a-strategy" diff --git a/tests/test_entitylist.py b/tests/test_entitylist.py index afb88bd57..675e84426 100644 --- a/tests/test_entitylist.py +++ b/tests/test_entitylist.py @@ -33,6 +33,9 @@ from smartsim.entity import EntityList from smartsim.settings import RunSettings +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + def test_entity_list_init(): with pytest.raises(NotImplementedError): diff --git a/tests/test_experiment.py b/tests/test_experiment.py index dbaa51bdc..c0185ab6d 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -23,21 +23,25 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import contextlib +import os import pytest from smartsim import Experiment +from smartsim._core.config import CONFIG from smartsim.entity import Model from smartsim.error import SmartSimError from smartsim.settings import RunSettings +from smartsim.status import STATUS_NEVER_STARTED + +# The tests in this file belong to the slow_tests group +pytestmark = pytest.mark.slow_tests -def test_model_prefix(fileutils): +def test_model_prefix(test_dir): exp_name = "test_prefix" exp = Experiment(exp_name) - test_dir = fileutils.make_test_dir() + model = exp.create_model( "model", path=test_dir, @@ -83,8 +87,7 @@ def test_status_typeerror(): def test_status_pre_launch(): model = Model("name", {}, "./", RunSettings("python")) exp = Experiment("test") - with pytest.raises(SmartSimError): - exp.get_status(model) + assert exp.get_status(model)[0] == STATUS_NEVER_STARTED def test_bad_ensemble_init_no_rs(): @@ -108,10 +111,9 @@ def test_bad_ensemble_init_no_rs_bs(): exp.create_ensemble("name") -def test_stop_entity(fileutils): +def test_stop_entity(test_dir): exp_name = "test_stop_entity" - exp = Experiment(exp_name) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, exp_path=test_dir) m = exp.create_model("model", path=test_dir, run_settings=RunSettings("sleep", "5")) exp.start(m, block=False) assert exp.finished(m) == False @@ -119,11 +121,10 @@ def test_stop_entity(fileutils): assert exp.finished(m) == True -def test_poll(fileutils): +def test_poll(test_dir): # Ensure that a SmartSimError is not raised exp_name = "test_exp_poll" - exp = Experiment(exp_name) - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, exp_path=test_dir) model = exp.create_model( "model", path=test_dir, run_settings=RunSettings("sleep", "5") ) @@ -132,15 +133,14 @@ def test_poll(fileutils): exp.stop(model) -def test_summary(fileutils): +def test_summary(test_dir): exp_name = "test_exp_summary" - exp = Experiment(exp_name) - test_dir = fileutils.make_test_dir() + exp = 
Experiment(exp_name, exp_path=test_dir) m = exp.create_model( "model", path=test_dir, run_settings=RunSettings("echo", "Hello") ) exp.start(m) - summary_str = exp.summary(format="plain") + summary_str = exp.summary(style="plain") print(summary_str) summary_lines = summary_str.split("\n") @@ -155,6 +155,7 @@ def test_summary(fileutils): assert 0 == int(row["RunID"]) assert 0 == int(row["Returncode"]) + def test_launcher_detection(wlmutils, monkeypatch): if wlmutils.get_test_launcher() == "pals": pytest.skip(reason="Launcher detection cannot currently detect pbs vs pals") @@ -164,3 +165,16 @@ def test_launcher_detection(wlmutils, monkeypatch): exp = Experiment("test-launcher-detection", launcher="auto") assert exp._launcher == wlmutils.get_test_launcher() + + +def test_enable_disable_telemtery(monkeypatch): + # TODO: Currently these are implemented by setting an environment variable + # so that ALL experiments instanced in a driver script will begin + # producing telemetry data. In the future it is planned to have this + # work on a "per-instance" basis + monkeypatch.setattr(os, "environ", {}) + exp = Experiment("my-exp") + exp.enable_telemetry() + assert CONFIG.telemetry_enabled + exp.disable_telemetry() + assert not CONFIG.telemetry_enabled diff --git a/tests/test_generator.py b/tests/test_generator.py index 4307b2c2c..e4618f9cd 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -24,17 +24,20 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import filecmp from os import path as osp import pytest +from tabulate import tabulate from smartsim import Experiment from smartsim._core.generation import Generator from smartsim.database import Orchestrator from smartsim.settings import RunSettings -from tabulate import tabulate -from smartsim.settings import SbatchSettings +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + rs = RunSettings("python", exe_args="sleep.py") @@ -50,15 +53,18 @@ """ -def test_ensemble(fileutils): +def get_gen_file(fileutils, filename): + return fileutils.get_test_conf_path(osp.join("generator_files", filename)) + + +def test_ensemble(fileutils, test_dir): exp = Experiment("gen-test", launcher="local") - test_dir = fileutils.get_test_dir() - gen = Generator(test_dir) + gen = Generator(test_dir) params = {"THERMO": [10, 20, 30], "STEPS": [10, 20, 30]} ensemble = exp.create_ensemble("test", params=params, run_settings=rs) - config = fileutils.get_test_conf_path("in.atm") + config = get_gen_file(fileutils, "in.atm") ensemble.attach_generator_files(to_configure=config) gen.generate_experiment(ensemble) @@ -68,20 +74,20 @@ def test_ensemble(fileutils): assert osp.isdir(osp.join(test_dir, "test/test_" + str(i))) -def test_ensemble_overwrite(fileutils): +def test_ensemble_overwrite(fileutils, test_dir): exp = Experiment("gen-test-overwrite", launcher="local") - test_dir = fileutils.get_test_dir() + gen = Generator(test_dir, overwrite=True) params = {"THERMO": [10, 20, 30], "STEPS": [10, 20, 30]} ensemble = exp.create_ensemble("test", params=params, run_settings=rs) - config = fileutils.get_test_conf_path("in.atm") + config = get_gen_file(fileutils, "in.atm") ensemble.attach_generator_files(to_configure=[config]) gen.generate_experiment(ensemble) # re generate without overwrite - config = fileutils.get_test_conf_path("in.atm") + config = get_gen_file(fileutils, "in.atm") ensemble.attach_generator_files(to_configure=[config]) 
gen.generate_experiment(ensemble) @@ -91,27 +97,26 @@ def test_ensemble_overwrite(fileutils): assert osp.isdir(osp.join(test_dir, "test/test_" + str(i))) -def test_ensemble_overwrite_error(fileutils): +def test_ensemble_overwrite_error(fileutils, test_dir): exp = Experiment("gen-test-overwrite-error", launcher="local") - test_dir = fileutils.get_test_dir() + gen = Generator(test_dir) params = {"THERMO": [10, 20, 30], "STEPS": [10, 20, 30]} ensemble = exp.create_ensemble("test", params=params, run_settings=rs) - config = fileutils.get_test_conf_path("in.atm") + config = get_gen_file(fileutils, "in.atm") ensemble.attach_generator_files(to_configure=[config]) gen.generate_experiment(ensemble) # re generate without overwrite - config = fileutils.get_test_conf_path("in.atm") + config = get_gen_file(fileutils, "in.atm") ensemble.attach_generator_files(to_configure=[config]) with pytest.raises(FileExistsError): gen.generate_experiment(ensemble) -def test_full_exp(fileutils, wlmutils): - test_dir = fileutils.make_test_dir() +def test_full_exp(fileutils, test_dir, wlmutils): exp = Experiment("gen-test", test_dir, launcher="local") model = exp.create_model("model", run_settings=rs) @@ -122,7 +127,7 @@ def test_full_exp(fileutils, wlmutils): params = {"THERMO": [10, 20, 30], "STEPS": [10, 20, 30]} ensemble = exp.create_ensemble("test_ens", params=params, run_settings=rs) - config = fileutils.get_test_conf_path("in.atm") + config = get_gen_file(fileutils, "in.atm") ensemble.attach_generator_files(to_configure=config) exp.generate(orc, ensemble, model) @@ -132,40 +137,38 @@ def test_full_exp(fileutils, wlmutils): assert osp.isdir(osp.join(test_dir, "test_ens/test_ens_" + str(i))) # test for orc dir - assert osp.isdir(osp.join(test_dir, "database")) + assert osp.isdir(osp.join(test_dir, orc.name)) # test for model file assert osp.isdir(osp.join(test_dir, "model")) assert osp.isfile(osp.join(test_dir, "model/sleep.py")) -def test_dir_files(fileutils): +def 
test_dir_files(fileutils, test_dir): """test the generate of models with files that are directories with subdirectories and files """ - test_dir = fileutils.make_test_dir() exp = Experiment("gen-test", test_dir, launcher="local") params = {"THERMO": [10, 20, 30], "STEPS": [10, 20, 30]} ensemble = exp.create_ensemble("dir_test", params=params, run_settings=rs) - conf_dir = fileutils.get_test_dir_path("test_dir") + conf_dir = get_gen_file(fileutils, "test_dir") ensemble.attach_generator_files(to_configure=conf_dir) - exp.generate(ensemble) + exp.generate(ensemble, tag="@") assert osp.isdir(osp.join(test_dir, "dir_test/")) for i in range(9): model_path = osp.join(test_dir, "dir_test/dir_test_" + str(i)) assert osp.isdir(model_path) assert osp.isdir(osp.join(model_path, "test_dir_1")) - assert osp.isfile(osp.join(model_path, "test.py")) + assert osp.isfile(osp.join(model_path, "test.in")) -def test_print_files(fileutils, capsys): +def test_print_files(fileutils, test_dir, capsys): """Test the stdout print of files attached to an ensemble""" - test_dir = fileutils.make_test_dir() exp = Experiment("print-attached-files-test", test_dir, launcher="local") ensemble = exp.create_ensemble("dir_test", replicas=1, run_settings=rs) @@ -177,9 +180,9 @@ def test_print_files(fileutils, capsys): params = {"THERMO": [10, 20], "STEPS": [20, 30]} ensemble = exp.create_ensemble("dir_test", params=params, run_settings=rs) - gen_dir = fileutils.get_test_dir_path("test_dir") - symlink_dir = fileutils.get_test_dir_path("to_symlink_dir") - copy_dir = fileutils.get_test_dir_path("to_copy_dir") + gen_dir = get_gen_file(fileutils, "test_dir") + symlink_dir = get_gen_file(fileutils, "to_symlink_dir") + copy_dir = get_gen_file(fileutils, "to_copy_dir") ensemble.print_attached_files() captured = capsys.readouterr() @@ -243,9 +246,8 @@ def test_print_files(fileutils, capsys): assert captured.out == expected_out_multi -def test_multiple_tags(fileutils): +def test_multiple_tags(fileutils, test_dir): 
"""Test substitution of multiple tagged parameters on same line""" - test_dir = fileutils.make_test_dir() exp = Experiment("test-multiple-tags", test_dir) model_params = {"port": 6379, "password": "unbreakable_password"} @@ -253,30 +255,65 @@ def test_multiple_tags(fileutils): parameterized_model = exp.create_model( "multi-tags", run_settings=model_settings, params=model_params ) - config = fileutils.get_test_conf_path("multi_tags_template.sh") + config = get_gen_file(fileutils, "multi_tags_template.sh") parameterized_model.attach_generator_files(to_configure=[config]) exp.generate(parameterized_model, overwrite=True) exp.start(parameterized_model, block=True) with open(osp.join(parameterized_model.path, "multi-tags.out")) as f: - line = f.readline() - assert ( - line.strip() == "My two parameters are 6379 and unbreakable_password, OK?" + log_content = f.read() + assert "My two parameters are 6379 and unbreakable_password, OK?" in log_content + + +def test_generation_log(fileutils, test_dir): + """Test that an error is issued when a tag is unused and make_fatal is True""" + + exp = Experiment("gen-log-test", test_dir, launcher="local") + + params = {"THERMO": [10, 20], "STEPS": [10, 20]} + ensemble = exp.create_ensemble("dir_test", params=params, run_settings=rs) + conf_file = get_gen_file(fileutils, "in.atm") + ensemble.attach_generator_files(to_configure=conf_file) + + def not_header(line): + """you can add other general checks in here""" + return not line.startswith("Generation start date and time:") + + exp.generate(ensemble, verbose=True) + + log_file = osp.join(test_dir, "smartsim_params.txt") + ground_truth = get_gen_file( + fileutils, osp.join("log_params", "smartsim_params.txt") + ) + + with open(log_file) as f1, open(ground_truth) as f2: + assert not not_header(f1.readline()) + f1 = filter(not_header, f1) + f2 = filter(not_header, f2) + assert all(x == y for x, y in zip(f1, f2)) + + for entity in ensemble: + assert filecmp.cmp( + osp.join(entity.path, 
"smartsim_params.txt"), + get_gen_file( + fileutils, + osp.join("log_params", "dir_test", entity.name, "smartsim_params.txt"), + ), ) -def test_config_dir(fileutils): +def test_config_dir(fileutils, test_dir): """Test the generation and configuration of models with tagged files that are directories with subdirectories and files """ exp = Experiment("config-dir", launcher="local") - test_dir = fileutils.make_test_dir() + gen = Generator(test_dir) params = {"PARAM0": [0, 1], "PARAM1": [2, 3]} ensemble = exp.create_ensemble("test", params=params, run_settings=rs) - config = fileutils.get_test_conf_path("tag_dir_template") + config = get_gen_file(fileutils, "tag_dir_template") ensemble.attach_generator_files(to_configure=config) gen.generate_experiment(ensemble) @@ -308,7 +345,7 @@ def test_no_gen_if_file_not_exist(fileutils): """ exp = Experiment("file-not-found", launcher="local") ensemble = exp.create_ensemble("test", params={"P": [0, 1]}, run_settings=rs) - config = fileutils.get_test_conf_path("path_not_exist") + config = get_gen_file(fileutils, "path_not_exist") with pytest.raises(FileNotFoundError): ensemble.attach_generator_files(to_configure=config) @@ -320,6 +357,23 @@ def test_no_gen_if_symlink_to_dir(fileutils): """ exp = Experiment("circular-config-files", launcher="local") ensemble = exp.create_ensemble("test", params={"P": [0, 1]}, run_settings=rs) - config = fileutils.get_test_conf_path("circular_config") + config = get_gen_file(fileutils, "circular_config") with pytest.raises(ValueError): ensemble.attach_generator_files(to_configure=config) + + +def test_no_file_overwrite(): + exp = Experiment("test_no_file_overwrite", launcher="local") + ensemble = exp.create_ensemble("test", params={"P": [0, 1]}, run_settings=rs) + with pytest.raises(ValueError): + ensemble.attach_generator_files( + to_configure=["/normal/file.txt", "/path/to/smartsim_params.txt"] + ) + with pytest.raises(ValueError): + ensemble.attach_generator_files( + 
to_symlink=["/normal/file.txt", "/path/to/smartsim_params.txt"] + ) + with pytest.raises(ValueError): + ensemble.attach_generator_files( + to_copy=["/normal/file.txt", "/path/to/smartsim_params.txt"] + ) diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 66af2dc0f..784219f82 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -24,22 +24,47 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import pytest +from smartsim._core.utils import helpers from smartsim._core.utils.helpers import cat_arg_and_value +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + def test_double_dash_concat(): result = cat_arg_and_value("--foo", "FOO") assert result == "--foo=FOO" + def test_single_dash_concat(): result = cat_arg_and_value("-foo", "FOO") assert result == "-foo FOO" + def test_single_char_concat(): result = cat_arg_and_value("x", "FOO") assert result == "-x FOO" + def test_fallthrough_concat(): result = cat_arg_and_value("xx", "FOO") # <-- no dashes, > 1 char assert result == "--xx=FOO" + + +def test_encode_decode_cmd_round_trip(): + orig_cmd = ["this", "is", "a", "cmd"] + decoded_cmd = helpers.decode_cmd(helpers.encode_cmd(orig_cmd)) + assert orig_cmd == decoded_cmd + assert orig_cmd is not decoded_cmd + + +def test_encode_raises_on_empty(): + with pytest.raises(ValueError): + helpers.encode_cmd([]) + + +def test_decode_raises_on_empty(): + with pytest.raises(ValueError): + helpers.decode_cmd("") diff --git a/tests/test_indirect.py b/tests/test_indirect.py new file mode 100644 index 000000000..f8af88266 --- /dev/null +++ b/tests/test_indirect.py @@ -0,0 +1,204 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import pathlib +import sys + +import psutil +import pytest + +from smartsim._core.entrypoints.indirect import cleanup, get_parser, get_ts, main +from smartsim._core.utils.helpers import encode_cmd +from smartsim._core.utils.serialize import MANIFEST_FILENAME, TELMON_SUBDIR + +ALL_ARGS = { + "+command", + "+entity_type", + "+telemetry_dir", + "+output_file", + "+error_file", + "+working_dir", +} + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +# fmt: off +@pytest.mark.parametrize( + ["cmd", "missing"], + [ + pytest.param("indirect.py", {"+name", "+command", "+entity_type", "+telemetry_dir", "+working_dir"}, id="no args"), + pytest.param("indirect.py -c echo +entity_type ttt +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+command"}, id="cmd typo"), + pytest.param("indirect.py -t orchestrator +command ccc +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+entity_type"}, id="etype typo"), + pytest.param("indirect.py -d /foo/bar +entity_type ttt +command ccc +output_file ooo +working_dir www +error_file eee", {"+telemetry_dir"}, id="dir typo"), + pytest.param("indirect.py +entity_type ttt +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+command"}, id="no cmd"), + pytest.param("indirect.py +command ccc +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+entity_type"}, id="no etype"), + pytest.param("indirect.py +command ccc +entity_type ttt +output_file ooo +working_dir www +error_file eee", {"+telemetry_dir"}, id="no dir"), + ] +) +# fmt: on +def test_parser(capsys, cmd, missing): + """Test that the parser reports any missing required arguments""" + parser = get_parser() + + args = cmd.split() + + captured = capsys.readouterr() # throw away existing output + with pytest.raises(SystemExit) as ex: + ns = parser.parse_args(args) + + captured = capsys.readouterr() + assert "the following arguments are required" in captured.err + 
for arg in missing: + assert arg in captured.err + + expected = ALL_ARGS - missing + msg_tuple = captured.err.split("the following arguments are required: ") + if len(msg_tuple) < 2: + assert False, "error message indicates no missing arguments" + + actual_missing = msg_tuple[1].strip() + for exp in expected: + assert f"{exp}/" not in actual_missing + + +def test_cleanup(capsys, monkeypatch): + """Ensure cleanup attempts termination of correct process""" + mock_pid = 123 + create_msg = "creating: {0}" + term_msg = "terminating: {0}" + + class MockProc: + def __init__(self, pid: int): + print(create_msg.format(pid)) + + def terminate(self): + print(term_msg.format(mock_pid)) + + captured = capsys.readouterr() # throw away existing output + + with monkeypatch.context() as ctx: + ctx.setattr("psutil.pid_exists", lambda pid: True) + ctx.setattr("psutil.Process", MockProc) + ctx.setattr("smartsim._core.entrypoints.indirect.STEP_PID", mock_pid) + cleanup() + + captured = capsys.readouterr() + assert create_msg.format(mock_pid) in captured.out + assert term_msg.format(mock_pid) in captured.out + + +def test_cleanup_late(capsys, monkeypatch): + """Ensure cleanup exceptions are swallowed if a process is already terminated""" + mock_pid = 123 + create_msg = "creating: {0}" + term_msg = "terminating: {0}" + + class MockMissingProc: + def __init__(self, pid: int) -> None: + print(create_msg.format(mock_pid)) + raise psutil.NoSuchProcess(pid) + + def terminate(self) -> None: + print(term_msg.format(mock_pid)) + + captured = capsys.readouterr() # throw away existing output + + with monkeypatch.context() as ctx: + ctx.setattr("psutil.pid_exists", lambda pid: True) + ctx.setattr("psutil.Process", MockMissingProc) + ctx.setattr("smartsim._core.entrypoints.indirect.STEP_PID", mock_pid) + cleanup() + + captured = capsys.readouterr() + assert create_msg.format(mock_pid) in captured.out + + +def test_ts(): + """Ensure expected output type""" + ts = get_ts() + assert isinstance(ts, int) 
+ + +def test_indirect_main_dir_check(test_dir): + """Ensure that the proxy validates the test directory exists""" + exp_dir = pathlib.Path(test_dir) + + cmd = ["echo", "unit-test"] + encoded_cmd = encode_cmd(cmd) + + status_path = exp_dir / TELMON_SUBDIR + + # show that a missing status_path is created when missing + main(encoded_cmd, "application", exp_dir, status_path) + + assert status_path.exists() + + +def test_indirect_main_cmd_check(capsys, test_dir, monkeypatch): + """Ensure that the proxy validates the cmd is not empty or whitespace-only""" + exp_dir = pathlib.Path(test_dir) + + captured = capsys.readouterr() # throw away existing output + with monkeypatch.context() as ctx, pytest.raises(ValueError) as ex: + ctx.setattr("smartsim._core.entrypoints.indirect.logger.error", print) + _ = main("", "application", exp_dir, exp_dir / TELMON_SUBDIR) + + captured = capsys.readouterr() + assert "Invalid cmd supplied" in ex.value.args[0] + + # test with non-emptystring cmd + with monkeypatch.context() as ctx, pytest.raises(ValueError) as ex: + ctx.setattr("smartsim._core.entrypoints.indirect.logger.error", print) + _ = main(" \n \t ", "application", exp_dir, exp_dir / TELMON_SUBDIR) + + captured = capsys.readouterr() + assert "Invalid cmd supplied" in ex.value.args[0] + + +def test_complete_process(fileutils, test_dir): + """Ensure the happy-path completes and returns a success return code""" + script = fileutils.get_test_conf_path("sleep.py") + + exp_dir = pathlib.Path(test_dir) + + raw_cmd = f"{sys.executable} {script} --time=1" + cmd = encode_cmd(raw_cmd.split()) + + rc = main(cmd, "application", exp_dir, exp_dir / TELMON_SUBDIR) + assert rc == 0 + + assert exp_dir.exists() + + # NOTE: don't have a manifest so we're falling back to default event path + data_dir = exp_dir / TELMON_SUBDIR + start_events = list(data_dir.rglob("start.json")) + stop_events = list(data_dir.rglob("stop.json")) + + assert start_events + assert stop_events diff --git a/tests/test_init.py 
b/tests/test_init.py index 2b1befa0f..76f58b59a 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -25,6 +25,12 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import pytest + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + __author__ = "Sam Partee" diff --git a/tests/test_interrupt.py b/tests/test_interrupt.py index f51d7fc4d..73616a848 100644 --- a/tests/test_interrupt.py +++ b/tests/test_interrupt.py @@ -24,15 +24,19 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import os import signal import time from threading import Thread +import pytest + from smartsim import Experiment from smartsim.settings import RunSettings +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + def keyboard_interrupt(pid): """Interrupt main thread""" @@ -40,13 +44,13 @@ def keyboard_interrupt(pid): os.kill(pid, signal.SIGINT) -def test_interrupt_blocked_jobs(fileutils): +def test_interrupt_blocked_jobs(test_dir): """ Launches and polls a model and an ensemble with two more models. Once polling starts, the SIGINT signal is sent to the main thread, and consequently, all running jobs are killed. """ - test_dir = fileutils.make_test_dir() + exp_name = "test_interrupt_blocked_jobs" exp = Experiment(exp_name, exp_path=test_dir) model = exp.create_model( @@ -77,7 +81,7 @@ def test_interrupt_blocked_jobs(fileutils): assert len(completed_jobs) == num_jobs -def test_interrupt_multi_experiment_unblocked_jobs(fileutils): +def test_interrupt_multi_experiment_unblocked_jobs(test_dir): """ Starts two Experiments, each having one model and an ensemble with two more models. Since @@ -85,7 +89,7 @@ def test_interrupt_multi_experiment_unblocked_jobs(fileutils): the SIGINT signal is sent, resulting in both Experiment's running jobs to be killed. 
""" - test_dir = fileutils.make_test_dir() + exp_names = ["test_interrupt_jobs_0", "test_interrupt_jobs_1"] experiments = [Experiment(exp_names[i], exp_path=test_dir) for i in range(2)] jobs_per_experiment = [0] * len(experiments) diff --git a/tests/test_launch_errors.py b/tests/test_launch_errors.py index 3f0c5a252..51d8b60a6 100644 --- a/tests/test_launch_errors.py +++ b/tests/test_launch_errors.py @@ -29,9 +29,12 @@ from smartsim import Experiment, status from smartsim.database import Orchestrator -from smartsim.error import SmartSimError, SSUnsupportedError +from smartsim.error import SSUnsupportedError from smartsim.settings import JsrunSettings, RunSettings +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + def test_unsupported_run_settings(): exp_name = "test-unsupported-run-settings" @@ -43,10 +46,9 @@ def test_unsupported_run_settings(): exp.start(model) -def test_model_failure(fileutils): +def test_model_failure(fileutils, test_dir): exp_name = "test-model-failure" - exp = Experiment(exp_name, launcher="local") - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) script = fileutils.get_test_conf_path("bad.py") settings = RunSettings("python", f"{script} --time=3") @@ -58,19 +60,18 @@ def test_model_failure(fileutils): assert all([stat == status.STATUS_FAILED for stat in statuses]) -def test_orchestrator_relaunch(fileutils, wlmutils): - """Test error when users try to launch second orchestrator""" - exp_name = "test-orc-error-on-relaunch" - exp = Experiment(exp_name, launcher="local") - test_dir = fileutils.make_test_dir() +def test_orchestrator_relaunch(test_dir, wlmutils): + """Test when users try to launch second orchestrator""" + exp_name = "test-orc-on-relaunch" + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) orc = Orchestrator(port=wlmutils.get_test_port()) orc.set_path(test_dir) orc_1 = Orchestrator(port=wlmutils.get_test_port() + 1) 
orc_1.set_path(test_dir) - - exp.start(orc) - with pytest.raises(SmartSimError): + try: + exp.start(orc) exp.start(orc_1) - - exp.stop(orc) + finally: + exp.stop(orc) + exp.stop(orc_1) diff --git a/tests/test_local_launch.py b/tests/test_local_launch.py index 857855205..e4d593b6f 100644 --- a/tests/test_local_launch.py +++ b/tests/test_local_launch.py @@ -24,18 +24,22 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import pytest from smartsim import Experiment, status +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + """ Test the launch of simple entity types with local launcher """ -def test_models(fileutils): +def test_models(fileutils, test_dir): exp_name = "test-models-local-launch" - exp = Experiment(exp_name, launcher="local") - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=3") @@ -48,10 +52,9 @@ def test_models(fileutils): assert all([stat == status.STATUS_COMPLETED for stat in statuses]) -def test_ensemble(fileutils): +def test_ensemble(fileutils, test_dir): exp_name = "test-ensemble-launch" - exp = Experiment(exp_name, launcher="local") - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=3") diff --git a/tests/test_local_multi_run.py b/tests/test_local_multi_run.py index 76bfc898c..b6eaba56a 100644 --- a/tests/test_local_multi_run.py +++ b/tests/test_local_multi_run.py @@ -24,18 +24,22 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import pytest from smartsim import Experiment, status +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + """ Test the launch of simple entity types with local launcher """ -def test_models(fileutils): +def test_models(fileutils, test_dir): exp_name = "test-models-local-launch" - exp = Experiment(exp_name, launcher="local") - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=5") diff --git a/tests/test_local_restart.py b/tests/test_local_restart.py index 99c6afd5e..b05401138 100644 --- a/tests/test_local_restart.py +++ b/tests/test_local_restart.py @@ -24,19 +24,22 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import pytest from smartsim import Experiment, status +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + + """ Test restarting ensembles and models. 
""" -def test_restart(fileutils): - +def test_restart(fileutils, test_dir): exp_name = "test-models-local-restart" - exp = Experiment(exp_name, launcher="local") - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=3") @@ -53,10 +56,9 @@ def test_restart(fileutils): assert all([stat == status.STATUS_COMPLETED for stat in statuses]) -def test_ensemble(fileutils): +def test_ensemble(fileutils, test_dir): exp_name = "test-ensemble-restart" - exp = Experiment(exp_name, launcher="local") - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=3") diff --git a/tests/test_lsf_parser.py b/tests/test_lsf_parser.py index 3b90eb0f3..f41de54d8 100644 --- a/tests/test_lsf_parser.py +++ b/tests/test_lsf_parser.py @@ -24,9 +24,14 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import pytest from smartsim._core.launcher.lsf import lsfParser +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + + # -- bsub --------------------------------------------------------- diff --git a/tests/test_lsf_settings.py b/tests/test_lsf_settings.py index 2770bde21..a71d658cb 100644 --- a/tests/test_lsf_settings.py +++ b/tests/test_lsf_settings.py @@ -25,13 +25,14 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from pprint import pformat - import pytest from smartsim.error import SSUnsupportedError from smartsim.settings import BsubBatchSettings, JsrunSettings +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + # ------ Jsrun ------------------------------------------------ @@ -105,16 +106,16 @@ def test_jsrun_args_mutation(): "np": 100, } settings = JsrunSettings("python", run_args=run_args) - + erf_settings = {"foo": "1", "bar": "2"} - + settings.set_erf_sets(erf_settings) assert settings.erf_sets["foo"] == "1" assert settings.erf_sets["bar"] == "2" erf_settings["foo"] = "111" erf_settings["bar"] = "111" - + assert settings.erf_sets["foo"] == "1" assert settings.erf_sets["bar"] == "2" @@ -241,7 +242,7 @@ def test_bsub_batch_manual(): def test_bsub_batch_alloc_flag_formatting_by_smt(): """Ensure that alloc_flags are formatted correctly when smts is changed""" - + # Check when no smt is set in the constructor sbatch = BsubBatchSettings() sbatch._format_alloc_flags() @@ -276,9 +277,9 @@ def test_bsub_batch_alloc_flag_formatting_by_smt(): # check multi-smt flag, with prefix sbatch = BsubBatchSettings(batch_args={"alloc_flags": '"smt3 smt4"'}, smts=4) sbatch._format_alloc_flags() - assert sbatch.batch_args["alloc_flags"] == "\"smt3 smt4\"" # <-- wrap in quotes - + assert sbatch.batch_args["alloc_flags"] == '"smt3 smt4"' # <-- wrap in quotes + # show that mismatched alloc_flags and smts are NOT touched - sbatch = BsubBatchSettings(batch_args={"alloc_flags": 'smt10'}, smts=2) + sbatch = BsubBatchSettings(batch_args={"alloc_flags": "smt10"}, smts=2) sbatch._format_alloc_flags() assert sbatch.batch_args["alloc_flags"] == "smt10" # <-- not smt2 diff --git a/tests/test_manifest.py b/tests/test_manifest.py index 7a856aea7..ea9920fad 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -25,20 +25,26 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import os.path from copy import deepcopy import pytest from smartsim import Experiment -from smartsim._core.control import Manifest +from smartsim._core.control.manifest import ( + LaunchedManifest, + LaunchedManifestBuilder, + Manifest, +) +from smartsim._core.control.manifest import ( + _LaunchedManifestMetadata as LaunchedManifestMetadata, +) from smartsim.database import Orchestrator from smartsim.error import SmartSimError from smartsim.settings import RunSettings -try: - import tensorflow -except ImportError: - pass +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b # ---- create entities for testing -------- @@ -50,7 +56,6 @@ model_2 = exp.create_model("model_1", run_settings=rs) ensemble = exp.create_ensemble("ensemble", run_settings=rs, replicas=1) - orc = Orchestrator() orc_1 = deepcopy(orc) orc_1.name = "orc2" @@ -63,7 +68,7 @@ def test_separate(): assert len(manifest.models) == 1 assert manifest.ensembles[0] == ensemble assert len(manifest.ensembles) == 1 - assert manifest.db == orc + assert manifest.dbs[0] == orc def test_no_name(): @@ -71,12 +76,6 @@ def test_no_name(): _ = Manifest(model_no_name) -def test_two_orc(): - with pytest.raises(SmartSimError): - manifest = Manifest(orc, orc_1) - manifest.db - - def test_separate_type(): with pytest.raises(TypeError): _ = Manifest([1, 2, 3]) @@ -91,7 +90,7 @@ def test_catch_empty_ensemble(): e = deepcopy(ensemble) e.entities = [] with pytest.raises(ValueError): - manifest = Manifest(e) + _ = Manifest(e) def test_corner_case(): @@ -105,3 +104,79 @@ class Person: p = Person() with pytest.raises(TypeError): _ = Manifest(p) + + +def test_launched_manifest_transform_data(): + models = [(model, 1), (model_2, 2)] + ensembles = [(ensemble, [(m, i) for i, m in enumerate(ensemble.entities)])] + dbs = [(orc, [(n, i) for i, n in enumerate(orc.entities)])] + launched = LaunchedManifest( + metadata=LaunchedManifestMetadata("name", "path", "launcher", "run_id"), + models=models, + 
ensembles=ensembles, + databases=dbs, + ) + transformed = launched.map(lambda x: str(x)) + assert transformed.models == tuple((m, str(i)) for m, i in models) + assert transformed.ensembles[0][1] == tuple((m, str(i)) for m, i in ensembles[0][1]) + assert transformed.databases[0][1] == tuple((n, str(i)) for n, i in dbs[0][1]) + + +def test_launched_manifest_builder_correctly_maps_data(): + lmb = LaunchedManifestBuilder("name", "path", "launcher name") + lmb.add_model(model, 1) + lmb.add_model(model_2, 1) + lmb.add_ensemble(ensemble, [i for i in range(len(ensemble.entities))]) + lmb.add_database(orc, [i for i in range(len(orc.entities))]) + + manifest = lmb.finalize() + assert len(manifest.models) == 2 + assert len(manifest.ensembles) == 1 + assert len(manifest.databases) == 1 + + +def test_launced_manifest_builder_raises_if_lens_do_not_match(): + lmb = LaunchedManifestBuilder("name", "path", "launcher name") + with pytest.raises(ValueError): + lmb.add_ensemble(ensemble, list(range(123))) + with pytest.raises(ValueError): + lmb.add_database(orc, list(range(123))) + + +def test_launched_manifest_builer_raises_if_attaching_data_to_empty_collection( + monkeypatch, +): + lmb = LaunchedManifestBuilder("name", "path", "launcher") + monkeypatch.setattr(ensemble, "entities", []) + with pytest.raises(ValueError): + lmb.add_ensemble(ensemble, []) + + +def test_lmb_and_launched_manifest_have_same_paths_for_launched_metadata(): + exp_path = "/path/to/some/exp" + lmb = LaunchedManifestBuilder("exp_name", exp_path, "launcher") + manifest = lmb.finalize() + assert ( + lmb.exp_telemetry_subdirectory == manifest.metadata.exp_telemetry_subdirectory + ) + assert ( + lmb.run_telemetry_subdirectory == manifest.metadata.run_telemetry_subdirectory + ) + assert ( + os.path.commonprefix( + [ + manifest.metadata.run_telemetry_subdirectory, + manifest.metadata.exp_telemetry_subdirectory, + manifest.metadata.manifest_file_path, + exp_path, + ] + ) + == exp_path + ) + assert os.path.commonprefix( 
+ [ + manifest.metadata.run_telemetry_subdirectory, + manifest.metadata.exp_telemetry_subdirectory, + manifest.metadata.manifest_file_path, + ] + ) == str(manifest.metadata.exp_telemetry_subdirectory) diff --git a/tests/test_model.py b/tests/test_model.py index feab88a90..88700ad23 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -27,12 +27,16 @@ import pytest from smartsim import Experiment +from smartsim._core.control.manifest import LaunchedManifestBuilder from smartsim._core.launcher.step import SbatchStep, SrunStep from smartsim.entity import Ensemble, Model from smartsim.error import EntityExistsError, SSUnsupportedError from smartsim.settings import RunSettings, SbatchSettings, SrunSettings from smartsim.settings.mpiSettings import _BaseMPISettings +# The tests in this file belong to the slow_tests group +pytestmark = pytest.mark.slow_tests + def test_register_incoming_entity_preexists(): exp = Experiment("experiment", launcher="local") @@ -85,8 +89,11 @@ def monkeypatch_exp_controller(monkeypatch): def _monkeypatch_exp_controller(exp): entity_steps = [] - def start_wo_job_manager(self, manifest, block=True, kill_on_interrupt=True): - self._launch(manifest) + def start_wo_job_manager( + self, exp_name, exp_path, manifest, block=True, kill_on_interrupt=True + ): + self._launch(exp_name, exp_path, manifest) + return LaunchedManifestBuilder("name", "path", "launcher").finalize() def launch_step_nop(self, step, entity): entity_steps.append((step, entity)) diff --git a/tests/test_modelwriter.py b/tests/test_modelwriter.py index ba5393c4f..4554a8b5a 100644 --- a/tests/test_modelwriter.py +++ b/tests/test_modelwriter.py @@ -32,16 +32,21 @@ import pytest from smartsim._core.generation.modelwriter import ModelWriter -from smartsim.error.errors import ParameterWriterError +from smartsim.error.errors import ParameterWriterError, SmartSimError from smartsim.settings import RunSettings +# The tests in this file belong to the group_b group +pytestmark = 
pytest.mark.group_b + + mw_run_settings = RunSettings("python", exe_args="sleep.py") -def test_write_easy_configs(fileutils): +def get_gen_file(fileutils, filename): + return fileutils.get_test_conf_path(path.join("generator_files", filename)) - test_dir = fileutils.make_test_dir() +def test_write_easy_configs(fileutils, test_dir): param_dict = { "5": 10, # MOM_input "FIRST": "SECOND", # example_input.i @@ -51,8 +56,8 @@ def test_write_easy_configs(fileutils): "1200": "120", # input.nml } - conf_path = fileutils.get_test_dir_path("easy/marked/") - correct_path = fileutils.get_test_dir_path("easy/correct/") + conf_path = get_gen_file(fileutils, "easy/marked/") + correct_path = get_gen_file(fileutils, "easy/correct/") # copy confs to gen directory dir_util.copy_tree(conf_path, test_dir) assert path.isdir(test_dir) @@ -68,10 +73,7 @@ def test_write_easy_configs(fileutils): assert filecmp.cmp(written, correct) -def test_write_med_configs(fileutils): - - test_dir = fileutils.make_test_dir() - +def test_write_med_configs(fileutils, test_dir): param_dict = { "1 0 0 0": "3 0 0 0", # in.ellipse.gayberne "'noleap'": "'leap'", # input.nml @@ -81,8 +83,8 @@ def test_write_med_configs(fileutils): "3*12.0": "3*14.0", # MOM_input } - conf_path = fileutils.get_test_dir_path("med/marked/") - correct_path = fileutils.get_test_dir_path("med/correct/") + conf_path = get_gen_file(fileutils, "med/marked/") + correct_path = get_gen_file(fileutils, "med/correct/") # copy confs to gen directory dir_util.copy_tree(conf_path, test_dir) @@ -101,11 +103,9 @@ def test_write_med_configs(fileutils): assert filecmp.cmp(written, correct) -def test_write_new_tag_configs(fileutils): +def test_write_new_tag_configs(fileutils, test_dir): """sets the tag to the dollar sign""" - test_dir = fileutils.make_test_dir() - param_dict = { "1 0 0 0": "3 0 0 0", # in.ellipse.gayberne "'noleap'": "'leap'", # input.nml @@ -115,8 +115,8 @@ def test_write_new_tag_configs(fileutils): "3*12.0": "3*14.0", # MOM_input } 
- conf_path = fileutils.get_test_dir_path("new-tag/marked/") - correct_path = fileutils.get_test_dir_path("new-tag/correct/") + conf_path = get_gen_file(fileutils, "new-tag/marked/") + correct_path = get_gen_file(fileutils, "new-tag/correct/") # copy confs to gen directory dir_util.copy_tree(conf_path, test_dir) @@ -144,3 +144,22 @@ def test_mw_error_2(): writer = ModelWriter() with pytest.raises(ParameterWriterError): writer._write_changes("[not/a/path]") + + +def test_write_mw_error_3(fileutils, test_dir): + param_dict = { + "5": 10, # MOM_input + } + + conf_path = get_gen_file(fileutils, "easy/marked/") + + # copy confs to gen directory + dir_util.copy_tree(conf_path, test_dir) + assert path.isdir(test_dir) + + # init modelwriter + writer = ModelWriter() + with pytest.raises(SmartSimError): + writer.configure_tagged_model_files( + glob(test_dir + "/*"), param_dict, make_missing_tags_fatal=True + ) diff --git a/tests/test_mpi_settings.py b/tests/test_mpi_settings.py index 275e8b75c..4e0bc48c8 100644 --- a/tests/test_mpi_settings.py +++ b/tests/test_mpi_settings.py @@ -40,6 +40,10 @@ _BaseMPISettings, ) +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + + # Throw a warning instead of failing on machines without an MPI implementation default_mpi_args = (sys.executable,) default_mpi_kwargs = {"fail_if_missing_exec": False} @@ -115,7 +119,6 @@ def test_expected_openmpi_instance_without_warning( def test_error_if_slurm_mpiexec(fileutils): - stubs_path = osp.join("mpi_impl_stubs", "slurm") stubs_path = fileutils.get_test_dir_path(stubs_path) stub_exe = osp.join(stubs_path, "mpiexec") diff --git a/tests/test_multidb.py b/tests/test_multidb.py new file mode 100644 index 000000000..c4336294e --- /dev/null +++ b/tests/test_multidb.py @@ -0,0 +1,490 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+from contextlib import contextmanager + +import pytest + +from smartsim import Experiment, status +from smartsim.database import Orchestrator +from smartsim.entity.entity import SmartSimEntity +from smartsim.error.errors import SSDBIDConflictError +from smartsim.log import get_logger + +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + + +logger = get_logger(__name__) + +supported_dbs = ["uds", "tcp"] + +on_wlm = (pytest.test_launcher in pytest.wlm_options,) + + +@contextmanager +def make_entity_context(exp: Experiment, entity: SmartSimEntity): + """Start entity in a context to ensure that it is always stopped""" + exp.generate(entity, overwrite=True) + try: + yield entity + finally: + if exp.get_status(entity)[0] == status.STATUS_RUNNING: + exp.stop(entity) + + +def choose_host(wlmutils, index=0): + hosts = wlmutils.get_test_hostlist() + if hosts: + return hosts[index] + else: + return None + + +def check_not_failed(exp, *args): + statuses = exp.get_status(*args) + assert all(stat is not status.STATUS_FAILED for stat in statuses) + + +@pytest.mark.parametrize("db_type", supported_dbs) +def test_db_identifier_standard_then_colo_error( + fileutils, wlmutils, coloutils, db_type, test_dir +): + """Test that it is possible to create_database then colocate_db_uds/colocate_db_tcp + with unique db_identifiers""" + + # Set experiment name + exp_name = "test_db_identifier_standard_then_colo" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + + test_script = fileutils.get_test_conf_path("smartredis/db_id_err.py") + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + # create regular database + orc = exp.create_database( + port=test_port, + interface=test_interface, + db_identifier="testdb_colo", + hosts=choose_host(wlmutils), + ) + assert orc.name 
== "testdb_colo" + + db_args = { + "port": test_port + 1, + "db_cpus": 1, + "debug": True, + "db_identifier": "testdb_colo", + } + + smartsim_model = coloutils.setup_test_colo( + fileutils, db_type, exp, test_script, db_args, on_wlm=on_wlm + ) + + assert ( + smartsim_model.run_settings.colocated_db_settings["db_identifier"] + == "testdb_colo" + ) + + with make_entity_context(exp, orc), make_entity_context(exp, smartsim_model): + exp.start(orc) + with pytest.raises(SSDBIDConflictError) as ex: + exp.start(smartsim_model) + + assert ( + "has already been used. Pass in a unique name for db_identifier" + in ex.value.args[0] + ) + check_not_failed(exp, orc) + + +@pytest.mark.parametrize("db_type", supported_dbs) +def test_db_identifier_colo_then_standard( + fileutils, wlmutils, coloutils, db_type, test_dir +): + """Test colocate_db_uds/colocate_db_tcp then create_database with database + identifiers. + """ + + # Set experiment name + exp_name = "test_db_identifier_colo_then_standard" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_script = fileutils.get_test_conf_path("smartredis/dbid.py") + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + # Create run settings + colo_settings = exp.create_run_settings("python", test_script) + colo_settings.set_nodes(1) + colo_settings.set_tasks_per_node(1) + + # Create the SmartSim Model + smartsim_model = exp.create_model("colocated_model", colo_settings) + smartsim_model.set_path(test_dir) + + db_args = { + "port": test_port, + "db_cpus": 1, + "debug": True, + "db_identifier": "testdb_colo", + } + + smartsim_model = coloutils.setup_test_colo( + fileutils, + db_type, + exp, + test_script, + db_args, + on_wlm=on_wlm, + ) + + assert ( + smartsim_model.run_settings.colocated_db_settings["db_identifier"] + == "testdb_colo" + ) + + # Create 
Database + orc = exp.create_database( + port=test_port + 1, + interface=test_interface, + db_identifier="testdb_colo", + hosts=choose_host(wlmutils), + ) + + assert orc.name == "testdb_colo" + + with make_entity_context(exp, orc), make_entity_context(exp, smartsim_model): + exp.start(smartsim_model, block=True) + exp.start(orc) + + check_not_failed(exp, orc, smartsim_model) + + +def test_db_identifier_standard_twice_not_unique(wlmutils, test_dir): + """Test uniqueness of db_identifier several calls to create_database, with non unique names, + checking error is raised before exp start is called""" + + # Set experiment name + exp_name = "test_db_identifier_multiple_create_database_not_unique" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + # CREATE DATABASE with db_identifier + orc = exp.create_database( + port=test_port, + interface=test_interface, + db_identifier="my_db", + hosts=choose_host(wlmutils), + ) + + assert orc.name == "my_db" + + orc2 = exp.create_database( + port=test_port + 1, + interface=test_interface, + db_identifier="my_db", + hosts=choose_host(wlmutils, index=1), + ) + + assert orc2.name == "my_db" + + # CREATE DATABASE with db_identifier + with make_entity_context(exp, orc), make_entity_context(exp, orc2): + exp.start(orc) + with pytest.raises(SSDBIDConflictError) as ex: + exp.start(orc2) + assert ( + "has already been used. 
Pass in a unique name for db_identifier" + in ex.value.args[0] + ) + check_not_failed(exp, orc) + + +def test_db_identifier_create_standard_once(test_dir, wlmutils): + """One call to create database with a database identifier""" + + # Set experiment name + exp_name = "test_db_identifier_create_standard_once" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + + # Create the SmartSim Experiment + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + + # Create the SmartSim database + db = exp.create_database( + port=test_port, + db_nodes=1, + interface=test_interface, + db_identifier="testdb_reg", + hosts=choose_host(wlmutils), + ) + with make_entity_context(exp, db): + exp.start(db) + + check_not_failed(exp, db) + + +def test_multidb_create_standard_twice(wlmutils, test_dir): + """Multiple calls to create database with unique db_identifiers""" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + + # start a new Experiment for this section + exp = Experiment( + "test_multidb_create_standard_twice", exp_path=test_dir, launcher=test_launcher + ) + + # create and start an instance of the Orchestrator database + db = exp.create_database( + port=test_port, + interface=test_interface, + db_identifier="testdb_reg", + hosts=choose_host(wlmutils, 1), + ) + + # create database with different db_id + db2 = exp.create_database( + port=test_port + 1, + interface=test_interface, + db_identifier="testdb_reg2", + hosts=choose_host(wlmutils, 2), + ) + + # launch + with make_entity_context(exp, db), make_entity_context(exp, db2): + exp.start(db, db2) + + with make_entity_context(exp, db), make_entity_context(exp, db2): + exp.start(db, db2) + + +@pytest.mark.parametrize("db_type", supported_dbs) +def 
test_multidb_colo_once(fileutils, test_dir, wlmutils, coloutils, db_type): + """create one model with colocated database with db_identifier""" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_port = wlmutils.get_test_port() + + test_script = fileutils.get_test_conf_path("smartredis/dbid.py") + + # start a new Experiment for this section + exp = Experiment( + "test_multidb_colo_once", launcher=test_launcher, exp_path=test_dir + ) + + # create run settings + run_settings = exp.create_run_settings("python", test_script) + run_settings.set_nodes(1) + run_settings.set_tasks_per_node(1) + + # Create the SmartSim Model + smartsim_model = exp.create_model("smartsim_model", run_settings) + smartsim_model.set_path(test_dir) + + db_args = { + "port": test_port + 1, + "db_cpus": 1, + "debug": True, + "db_identifier": "testdb_colo", + } + # Create model with colocated database + + smartsim_model = coloutils.setup_test_colo( + fileutils, + db_type, + exp, + test_script, + db_args, + on_wlm=on_wlm, + ) + + with make_entity_context(exp, smartsim_model): + exp.start(smartsim_model) + + check_not_failed(exp, smartsim_model) + + +@pytest.mark.parametrize("db_type", supported_dbs) +def test_multidb_standard_then_colo(fileutils, test_dir, wlmutils, coloutils, db_type): + """Create regular database then colocate_db_tcp/uds with unique db_identifiers""" + + # Retrieve parameters from testing environment + test_port = wlmutils.get_test_port() + + test_script = fileutils.get_test_conf_path("smartredis/multidbid.py") + test_interface = wlmutils.get_test_interface() + test_launcher = wlmutils.get_test_launcher() + + # start a new Experiment for this section + exp = Experiment( + "test_multidb_standard_then_colo", exp_path=test_dir, launcher=test_launcher + ) + + # create and generate an instance of the Orchestrator database + db = exp.create_database( + port=test_port, + interface=test_interface, + db_identifier="testdb_reg", + 
hosts=choose_host(wlmutils), + ) + + db_args = { + "port": test_port + 1, + "db_cpus": 1, + "debug": True, + "db_identifier": "testdb_colo", + } + # Create model with colocated database + smartsim_model = coloutils.setup_test_colo( + fileutils, + db_type, + exp, + test_script, + db_args, + on_wlm=on_wlm, + ) + + with make_entity_context(exp, db), make_entity_context(exp, smartsim_model): + exp.start(db) + exp.start(smartsim_model, block=True) + + check_not_failed(exp, smartsim_model, db) + + +@pytest.mark.parametrize("db_type", supported_dbs) +def test_multidb_colo_then_standard(fileutils, test_dir, wlmutils, coloutils, db_type): + """create regular database then colocate_db_tcp/uds with unique db_identifiers""" + + # Retrieve parameters from testing environment + test_port = wlmutils.get_test_port() + + test_script = fileutils.get_test_conf_path("smartredis/multidbid.py") + test_interface = wlmutils.get_test_interface() + test_launcher = wlmutils.get_test_launcher() + + # start a new Experiment + exp = Experiment( + "test_multidb_colo_then_standard", exp_path=test_dir, launcher=test_launcher + ) + + db_args = { + "port": test_port, + "db_cpus": 1, + "debug": True, + "db_identifier": "testdb_colo", + } + + # Create model with colocated database + smartsim_model = coloutils.setup_test_colo( + fileutils, db_type, exp, test_script, db_args, on_wlm=on_wlm + ) + + # create and start an instance of the Orchestrator database + db = exp.create_database( + port=test_port + 1, + interface=test_interface, + db_identifier="testdb_reg", + hosts=choose_host(wlmutils), + ) + + with make_entity_context(exp, db), make_entity_context(exp, smartsim_model): + exp.start(db) + exp.start(smartsim_model, block=True) + + check_not_failed(exp, db, smartsim_model) + + +@pytest.mark.skipif( + pytest.test_launcher not in pytest.wlm_options, + reason="Not testing WLM integrations", +) +@pytest.mark.parametrize("db_type", supported_dbs) +def test_launch_cluster_orc_single_dbid( + test_dir, 
coloutils, fileutils, wlmutils, db_type +): + """test clustered 3-node orchestrator with single command with a database identifier""" + # TODO detect number of nodes in allocation and skip if not sufficent + + exp_name = "test_launch_cluster_orc_single_dbid" + launcher = wlmutils.get_test_launcher() + test_port = wlmutils.get_test_port() + test_script = fileutils.get_test_conf_path("smartredis/multidbid.py") + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) + + # batch = False to launch on existing allocation + network_interface = wlmutils.get_test_interface() + orc: Orchestrator = exp.create_database( + wlmutils.get_test_port(), + db_nodes=3, + batch=False, + interface=network_interface, + single_cmd=True, + hosts=wlmutils.get_test_hostlist(), + db_identifier="testdb_reg", + ) + + db_args = { + "port": test_port, + "db_cpus": 1, + "debug": True, + "db_identifier": "testdb_colo", + } + + # Create model with colocated database + smartsim_model = coloutils.setup_test_colo( + fileutils, db_type, exp, test_script, db_args, on_wlm=on_wlm + ) + + with make_entity_context(exp, orc), make_entity_context(exp, smartsim_model): + exp.start(orc, block=True) + exp.start(smartsim_model, block=True) + job_dict = exp._control._jobs.get_db_host_addresses() + assert len(job_dict[orc.entities[0].db_identifier]) == 3 + + check_not_failed(exp, orc, smartsim_model) diff --git a/tests/test_orc_config_settings.py b/tests/test_orc_config_settings.py index 18286c463..f08467be0 100644 --- a/tests/test_orc_config_settings.py +++ b/tests/test_orc_config_settings.py @@ -36,6 +36,9 @@ except AttributeError: pytestmark = pytest.mark.skip(reason="SmartRedis version is < 0.3.1") +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + def test_config_methods(dbutils, local_db): """Test all configuration file edit methods on an active db""" diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index a634009d9..4a1b08367 100644 --- 
a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -33,6 +33,9 @@ from smartsim.error import SmartSimError from smartsim.error.errors import SSUnsupportedError +# The tests in this file belong to the slow_tests group +pytestmark = pytest.mark.slow_tests + def test_orc_parameters(): threads_per_queue = 2 @@ -65,10 +68,9 @@ def test_inactive_orc_get_address(): db.get_address() -def test_orc_active_functions(fileutils, wlmutils): +def test_orc_active_functions(test_dir, wlmutils): exp_name = "test_orc_active_functions" - exp = Experiment(exp_name, launcher="local") - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) db = Orchestrator(port=wlmutils.get_test_port()) db.set_path(test_dir) @@ -93,10 +95,9 @@ def test_orc_active_functions(fileutils, wlmutils): db.get_address() -def test_multiple_interfaces(fileutils, wlmutils): +def test_multiple_interfaces(test_dir, wlmutils): exp_name = "test_multiple_interfaces" - exp = Experiment(exp_name, launcher="local") - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) net_if_addrs = psutil.net_if_addrs() net_if_addrs = [ @@ -226,6 +227,35 @@ def test_slurm_set_batch_arg(wlmutils): assert orc2.batch_settings.batch_args["account"] == "ACCOUNT" +@pytest.mark.parametrize( + "single_cmd", + [ + pytest.param(True, id="Single MPMD `srun`"), + pytest.param(False, id="Multiple `srun`s"), + ], +) +def test_orc_results_in_correct_number_of_shards(single_cmd): + num_shards = 5 + orc = Orchestrator( + port=12345, + launcher="slurm", + run_command="srun", + db_nodes=num_shards, + batch=False, + single_cmd=single_cmd, + ) + if single_cmd: + assert len(orc.entities) == 1 + (node,) = orc.entities + assert len(node.run_settings.mpmd) == num_shards - 1 + else: + assert len(orc.entities) == num_shards + assert all(node.run_settings.mpmd == [] for node in orc.entities) + assert ( + orc.num_shards == orc.db_nodes == 
sum(node.num_shards for node in orc.entities) + ) + + ###### Cobalt ###### diff --git a/tests/test_pals_settings.py b/tests/test_pals_settings.py index 7bc3a6520..2cd725f65 100644 --- a/tests/test_pals_settings.py +++ b/tests/test_pals_settings.py @@ -31,14 +31,26 @@ import pytest -from smartsim.error import SSUnsupportedError -from smartsim.settings import PalsMpiexecSettings +import smartsim._core.config.config from smartsim._core.launcher import PBSLauncher from smartsim._core.launcher.step.mpiStep import MpiexecStep +from smartsim.error import SSUnsupportedError +from smartsim.settings import PalsMpiexecSettings + +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + default_exe = sys.executable default_kwargs = {"fail_if_missing_exec": False} + +@pytest.fixture(autouse=True) +def turn_off_telemetry_indirect(monkeypatch): + monkeypatch.setattr(smartsim._core.config.config.Config, "telemetry_enabled", False) + yield + + # Uncomment when # @pytest.mark.parametrize( # "function_name",[ @@ -55,6 +67,12 @@ # func(None) +def test_affinity_script(): + settings = PalsMpiexecSettings(default_exe, **default_kwargs) + settings.set_gpu_affinity_script("/path/to/set_affinity_gpu.sh", 1, 2) + assert settings.format_run_args() == ["/path/to/set_affinity_gpu.sh", "1", "2"] + + def test_cpu_binding_type(): settings = PalsMpiexecSettings(default_exe, **default_kwargs) settings.set_cpu_binding_type("numa") @@ -115,7 +133,7 @@ def set_env_var_to_inherit(rs): ], ) def test_pbs_can_make_step_from_pals_settings_fmt_cmd( - monkeypatch, mock_mpiexec, fileutils, rs_mutation, run_args + monkeypatch, mock_mpiexec, test_dir, rs_mutation, run_args ): # Setup run settings exe_args = ["-c", """'print("Hello")'"""] @@ -126,7 +144,7 @@ def test_pbs_can_make_step_from_pals_settings_fmt_cmd( launcher = PBSLauncher() monkeypatch.setenv(f"PBS_JOBID", "mock-job") - wdir = fileutils.make_test_dir() + wdir = test_dir step = launcher.create_step("my_step", wdir, rs) 
assert isinstance(step, MpiexecStep) assert step.get_launch_cmd() == [ @@ -139,7 +157,7 @@ def test_pbs_can_make_step_from_pals_settings_fmt_cmd( ] -def test_pals_settings_can_be_correctly_made_mpmd(monkeypatch, fileutils, mock_mpiexec): +def test_pals_settings_can_be_correctly_made_mpmd(monkeypatch, test_dir, mock_mpiexec): # Setup run settings def make_rs(exe, exe_args): return PalsMpiexecSettings(exe, exe_args), [exe] + exe_args @@ -166,7 +184,7 @@ def set_tasks(rs, num): launcher = PBSLauncher() monkeypatch.setenv(f"PBS_JOBID", "mock-job") - wdir = fileutils.make_test_dir() + wdir = test_dir step = launcher.create_step("my_step", wdir, rs_1) assert isinstance(step, MpiexecStep) assert step.get_launch_cmd() == [ diff --git a/tests/test_pbs_parser.py b/tests/test_pbs_parser.py index 638fa496c..554780cd7 100644 --- a/tests/test_pbs_parser.py +++ b/tests/test_pbs_parser.py @@ -24,11 +24,16 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from os.path import dirname from pathlib import Path +import pytest + from smartsim._core.launcher.pbs import pbsParser +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + + # -- qsub --------------------------------------------------------- diff --git a/tests/test_pbs_settings.py b/tests/test_pbs_settings.py new file mode 100644 index 000000000..ed450bd82 --- /dev/null +++ b/tests/test_pbs_settings.py @@ -0,0 +1,128 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +from smartsim.error import SSConfigError +from smartsim.settings import QsubBatchSettings + +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + + +def test_node_formatting(): + def validate_settings(settings, spec, num_nodes, num_cpus): + assert settings._create_resource_list() == [ + f"-l {spec}={num_nodes}:ncpus={num_cpus}" + ] + assert settings._ncpus == num_cpus + + num_nodes = 10 + num_cpus = 36 + + # Test by specifying the number of nodes via setting a resource + for spec in ["nodes", "select"]: + # Test by setting nodes + settings = QsubBatchSettings() + settings.set_resource(spec, num_nodes) + settings.set_ncpus(36) + validate_settings(settings, spec, num_nodes, num_cpus) + + # Test when setting nodes through the constructor + settings = QsubBatchSettings(ncpus=num_cpus, nodes=num_nodes) + validate_settings(settings, "nodes", num_nodes, num_cpus) + + # Test when setting nodes through the constructor via resource + 
settings = QsubBatchSettings(ncpus=num_cpus, resources={"nodes": num_nodes}) + validate_settings(settings, "nodes", num_nodes, num_cpus) + + # Test when setting select through the constructor via resource + settings = QsubBatchSettings(ncpus=num_cpus, resources={"select": num_nodes}) + validate_settings(settings, "select", num_nodes, num_cpus) + + +def test_select_nodes_error(): + # # Test failure on initialization + with pytest.raises(SSConfigError): + QsubBatchSettings(nodes=10, resources={"select": 10}) + + # Test setting via nodes and then select + settings = QsubBatchSettings() + settings.set_nodes(10) + with pytest.raises(SSConfigError): + settings.set_resource("select", 10) + + # Manually put "select" in the resource dictionary and + # make sure the resource formatter catches the error + settings = QsubBatchSettings() + with pytest.raises(SSConfigError): + settings.resources = {"nodes": 10, "select": 20} + + # # Test setting via select and then nodes + settings = QsubBatchSettings() + settings.set_resource("select", 10) + with pytest.raises(SSConfigError): + settings.set_nodes(10) + + +def test_resources_is_a_copy(): + settings = QsubBatchSettings() + resources = settings.resources + assert resources is not settings._resources + + +def test_nodes_and_select_not_ints_error(): + expected_error = TypeError + with pytest.raises(expected_error): + settings = QsubBatchSettings() + settings.set_nodes("10") + with pytest.raises(expected_error): + settings = QsubBatchSettings() + settings.set_resource("nodes", "10") + with pytest.raises(expected_error): + settings = QsubBatchSettings() + settings.set_resource("select", "10") + with pytest.raises(expected_error): + settings = QsubBatchSettings() + settings.resources = {"nodes": "10"} + with pytest.raises(expected_error): + settings = QsubBatchSettings() + settings.resources = {"select": "10"} + + +def test_resources_not_set_on_error(): + settings = QsubBatchSettings(nodes=10) + unaltered_resources = settings.resources 
+ with pytest.raises(SSConfigError): + settings.resources = {"nodes": 10, "select": 10} + + assert unaltered_resources == settings.resources + + +def test_valid_types_in_resources(): + settings = QsubBatchSettings(nodes=10) + with pytest.raises(TypeError): + settings.set_resource("foo", None) diff --git a/tests/test_reconnect_orchestrator.py b/tests/test_reconnect_orchestrator.py index 609ba48e7..0faa92242 100644 --- a/tests/test_reconnect_orchestrator.py +++ b/tests/test_reconnect_orchestrator.py @@ -24,25 +24,29 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import os.path as osp import time +import pytest + from smartsim import Experiment, status from smartsim.database import Orchestrator +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + + first_dir = "" # TODO ensure database is shutdown # use https://stackoverflow.com/questions/22627659/run-code-before-and-after-each-test-in-py-test -def test_local_orchestrator(fileutils, wlmutils): +def test_local_orchestrator(test_dir, wlmutils): """Test launching orchestrator locally""" global first_dir exp_name = "test-orc-launch-local" - exp = Experiment(exp_name, launcher="local") - test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) first_dir = test_dir orc = Orchestrator(port=wlmutils.get_test_port()) @@ -57,12 +61,12 @@ def test_local_orchestrator(fileutils, wlmutils): exp._control._launcher.task_manager.actively_monitoring = False -def test_reconnect_local_orc(): +def test_reconnect_local_orc(test_dir): """Test reconnecting to orchestrator from first experiment""" global first_dir # start new experiment exp_name = "test-orc-local-reconnect-2nd" - exp_2 = Experiment(exp_name, launcher="local") + exp_2 = Experiment(exp_name, launcher="local", exp_path=test_dir) checkpoint = osp.join(first_dir, "smartsim_db.dat") 
reloaded_orc = exp_2.reconnect_orchestrator(checkpoint) diff --git a/tests/test_run_settings.py b/tests/test_run_settings.py index 8b47cb5d6..7bcd6d874 100644 --- a/tests/test_run_settings.py +++ b/tests/test_run_settings.py @@ -42,6 +42,9 @@ ) from smartsim.settings.settings import create_run_settings +# The tests in this file belong to the slow_tests group +pytestmark = pytest.mark.slow_tests + def test_create_run_settings_local(): # no run command provided diff --git a/tests/test_serialize.py b/tests/test_serialize.py new file mode 100644 index 000000000..167e7e445 --- /dev/null +++ b/tests/test_serialize.py @@ -0,0 +1,175 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +import logging +from pathlib import Path + +import pytest + +import smartsim._core.config.config +from smartsim import Experiment +from smartsim._core._cli import utils +from smartsim._core.control.manifest import LaunchedManifestBuilder +from smartsim._core.utils import serialize +from smartsim.database.orchestrator import Orchestrator + +_REL_MANIFEST_PATH = f"{serialize.TELMON_SUBDIR}/{serialize.MANIFEST_FILENAME}" +_CFG_TM_ENABLED_ATTR = "telemetry_enabled" + +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + + +@pytest.fixture(autouse=True) +def turn_on_tm(monkeypatch): + monkeypatch.setattr( + smartsim._core.config.config.Config, + _CFG_TM_ENABLED_ATTR, + property(lambda self: True), + ) + yield + + +def test_serialize_creates_a_manifest_json_file_if_dne(test_dir): + lmb = LaunchedManifestBuilder("exp", test_dir, "launcher") + serialize.save_launch_manifest(lmb.finalize()) + manifest_json = Path(test_dir) / _REL_MANIFEST_PATH + + assert manifest_json.is_file() + with open(manifest_json, "r") as f: + manifest = json.load(f) + assert manifest["experiment"]["name"] == "exp" + assert manifest["experiment"]["launcher"] == "launcher" + assert isinstance(manifest["runs"], list) + assert len(manifest["runs"]) == 1 + + +def test_serialize_does_not_write_manifest_json_if_telemetry_monitor_is_off( + test_dir, monkeypatch +): + monkeypatch.setattr( + smartsim._core.config.config.Config, + 
_CFG_TM_ENABLED_ATTR, + property(lambda self: False), + ) + lmb = LaunchedManifestBuilder("exp", test_dir, "launcher") + serialize.save_launch_manifest(lmb.finalize()) + manifest_json = Path(test_dir) / _REL_MANIFEST_PATH + assert not manifest_json.exists() + + +def test_serialize_appends_a_manifest_json_exists(test_dir): + manifest_json = Path(test_dir) / _REL_MANIFEST_PATH + serialize.save_launch_manifest( + LaunchedManifestBuilder("exp", test_dir, "launcher").finalize() + ) + serialize.save_launch_manifest( + LaunchedManifestBuilder("exp", test_dir, "launcher").finalize() + ) + serialize.save_launch_manifest( + LaunchedManifestBuilder("exp", test_dir, "launcher").finalize() + ) + + assert manifest_json.is_file() + with open(manifest_json, "r") as f: + manifest = json.load(f) + assert isinstance(manifest["runs"], list) + assert len(manifest["runs"]) == 3 + assert len({run["run_id"] for run in manifest["runs"]}) == 3 + + +def test_serialize_overwites_file_if_not_json(test_dir): + manifest_json = Path(test_dir) / _REL_MANIFEST_PATH + manifest_json.parent.mkdir(parents=True, exist_ok=True) + with open(manifest_json, "w") as f: + f.write("This is not a json\n") + + lmb = LaunchedManifestBuilder("exp", test_dir, "launcher") + serialize.save_launch_manifest(lmb.finalize()) + with open(manifest_json, "r") as f: + assert isinstance(json.load(f), dict) + + +def test_started_entities_are_serialized(test_dir): + exp_name = "test-exp" + test_dir = Path(test_dir) / exp_name + test_dir.mkdir(parents=True) + exp = Experiment(exp_name, exp_path=str(test_dir), launcher="local") + + rs1 = exp.create_run_settings("echo", ["hello", "world"]) + rs2 = exp.create_run_settings("echo", ["spam", "eggs"]) + + hello_world_model = exp.create_model("echo-hello", run_settings=rs1) + spam_eggs_model = exp.create_model("echo-spam", run_settings=rs2) + hello_ensemble = exp.create_ensemble("echo-ensemble", run_settings=rs1, replicas=3) + + exp.generate(hello_world_model, spam_eggs_model, 
hello_ensemble) + exp.start(hello_world_model, spam_eggs_model, block=False) + exp.start(hello_ensemble, block=False) + + manifest_json = Path(exp.exp_path) / _REL_MANIFEST_PATH + try: + with open(manifest_json, "r") as f: + manifest = json.load(f) + assert len(manifest["runs"]) == 2 + assert len(manifest["runs"][0]["model"]) == 2 + assert len(manifest["runs"][0]["ensemble"]) == 0 + assert len(manifest["runs"][1]["model"]) == 0 + assert len(manifest["runs"][1]["ensemble"]) == 1 + assert len(manifest["runs"][1]["ensemble"][0]["models"]) == 3 + finally: + exp.stop(hello_world_model, spam_eggs_model, hello_ensemble) + + +def test_serialzed_database_does_not_break_if_using_a_non_standard_install(monkeypatch): + monkeypatch.setattr(utils, "get_db_path", lambda: None) + db = Orchestrator() + dict_ = serialize._dictify_db(db, []) + assert dict_["type"] == "Unknown" + + +def test_dictify_run_settings_warns_when_attepting_to_dictify_mpmd( + monkeypatch, caplog, test_dir +): + # TODO: Eventually this test should be removed and we should be able to + # handle MPMD run settings as part of the output dict + exp_name = "test-exp" + test_dir = Path(test_dir) / exp_name + test_dir.mkdir(parents=True) + exp = Experiment(exp_name, exp_path=str(test_dir), launcher="local") + + rs1 = exp.create_run_settings("echo", ["hello", "world"]) + rs2 = exp.create_run_settings("echo", ["spam", "eggs"]) + + # Make rs "MPMD" + monkeypatch.setattr(rs1, "mpmd", [rs2], raising=False) + # Make work with colored logs + monkeypatch.setattr(serialize, "_LOGGER", logging.getLogger()) + serialize._dictify_run_settings(rs1) + (rec,) = caplog.records + assert rec.levelno == logging.WARNING + assert "MPMD run settings" in rec.msg diff --git a/tests/test_shell_util.py b/tests/test_shell_util.py index 4ba3566d4..7b7ac55b7 100644 --- a/tests/test_shell_util.py +++ b/tests/test_shell_util.py @@ -30,6 +30,9 @@ from smartsim._core.launcher.util.shell import * +# The tests in this file belong to the group_b group 
+pytestmark = pytest.mark.group_b + def test_execute_cmd(): returncode, out, err = execute_cmd(["hostname"]) diff --git a/tests/test_slurm_get_alloc.py b/tests/test_slurm_get_alloc.py index de488b25f..270bbf014 100644 --- a/tests/test_slurm_get_alloc.py +++ b/tests/test_slurm_get_alloc.py @@ -24,9 +24,13 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import pytest from smartsim.slurm import _get_alloc_cmd +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + def test_get_alloc_format(): time = "10:00:00" diff --git a/tests/test_slurm_parser.py b/tests/test_slurm_parser.py index 0674d9fe9..30c6c5b31 100644 --- a/tests/test_slurm_parser.py +++ b/tests/test_slurm_parser.py @@ -24,8 +24,14 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import pytest + from smartsim._core.launcher.slurm import slurmParser +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + + # -- Salloc --------------------------------------------------------- diff --git a/tests/test_slurm_settings.py b/tests/test_slurm_settings.py index da18a0133..d6bfd5063 100644 --- a/tests/test_slurm_settings.py +++ b/tests/test_slurm_settings.py @@ -24,14 +24,15 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import logging -import os - import pytest from smartsim.error import SSUnsupportedError from smartsim.settings import SbatchSettings, SrunSettings +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + + # ------ Srun ------------------------------------------------ diff --git a/tests/test_slurm_validation.py b/tests/test_slurm_validation.py index 585ae2d16..c3f796ba6 100644 --- a/tests/test_slurm_validation.py +++ b/tests/test_slurm_validation.py @@ -30,6 +30,10 @@ from smartsim.error.errors import LauncherError from smartsim.slurm import _get_system_partition_info, get_default_partition, validate +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + + # retrieved from pytest fixtures if pytest.test_launcher != "slurm": pytestmark = pytest.mark.skip(reason="Test is only for Slurm WLM systems") diff --git a/tests/test_smartredis.py b/tests/test_smartredis.py index c44969ce1..2f234c217 100644 --- a/tests/test_smartredis.py +++ b/tests/test_smartredis.py @@ -32,6 +32,10 @@ from smartsim.database import Orchestrator from smartsim.entity import Ensemble, Model +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + + """Test smartredis integration for ensembles. Two copies of the same program will be executed concurrently, and name collisions will be avoided through smartredis prefixing: @@ -55,13 +59,12 @@ ) -def test_exchange(fileutils, wlmutils): +def test_exchange(fileutils, test_dir, wlmutils): """Run two processes, each process puts a tensor on the DB, then accesses the other process's tensor. Finally, the tensor is used to run a model. 
""" - test_dir = fileutils.make_test_dir() exp = Experiment( "smartredis_ensemble_exchange", exp_path=test_dir, launcher="local" ) @@ -93,24 +96,21 @@ def test_exchange(fileutils, wlmutils): # get and confirm statuses statuses = exp.get_status(ensemble) - if not all([stat == status.STATUS_COMPLETED for stat in statuses]): + try: + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + # stop the orchestrator exp.stop(orc) - assert False # client ensemble failed - - # stop the orchestrator - exp.stop(orc) - - print(exp.summary()) -def test_consumer(fileutils, wlmutils): +def test_consumer(fileutils, test_dir, wlmutils): """Run three processes, each one of the first two processes puts a tensor on the DB; the third process accesses the tensors put by the two producers. Finally, the tensor is used to run a model by each producer and the consumer accesses the two results. """ - test_dir = fileutils.make_test_dir() + exp = Experiment( "smartredis_ensemble_consumer", exp_path=test_dir, launcher="local" ) @@ -145,11 +145,8 @@ def test_consumer(fileutils, wlmutils): # get and confirm statuses statuses = exp.get_status(ensemble) - if not all([stat == status.STATUS_COMPLETED for stat in statuses]): + try: + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + # stop the orchestrator exp.stop(orc) - assert False # client ensemble failed - - # stop the orchestrator - exp.stop(orc) - - print(exp.summary()) diff --git a/tests/test_step_info.py b/tests/test_step_info.py index 75b93f15e..eee920192 100644 --- a/tests/test_step_info.py +++ b/tests/test_step_info.py @@ -24,10 +24,14 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import pytest from smartsim import status from smartsim._core.launcher.stepInfo import * +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + def test_str(): step_info = StepInfo( diff --git a/tests/test_telemetry_monitor.py b/tests/test_telemetry_monitor.py new file mode 100644 index 000000000..3f804b077 --- /dev/null +++ b/tests/test_telemetry_monitor.py @@ -0,0 +1,1114 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import logging +import pathlib +import sys +import time +import typing as t +import uuid + +import pytest + +import smartsim._core.config.config as cfg +from conftest import FileUtils, WLMUtils +from smartsim import Experiment +from smartsim._core.control.job import Job, JobEntity +from smartsim._core.control.jobmanager import JobManager +from smartsim._core.entrypoints.telemetrymonitor import ( + ManifestEventHandler, + can_shutdown, + event_loop, + faux_return_code, + get_parser, + get_ts, + hydrate_persistable, + load_manifest, + track_event, +) +from smartsim._core.launcher.launcher import WLMLauncher +from smartsim._core.launcher.slurm.slurmLauncher import SlurmLauncher +from smartsim._core.launcher.step.step import Step, proxyable_launch_cmd +from smartsim._core.launcher.stepInfo import StepInfo +from smartsim._core.utils import serialize +from smartsim.error.errors import UnproxyableStepError +from smartsim.settings.base import RunSettings +from smartsim.status import ( + STATUS_CANCELLED, + STATUS_COMPLETED, + STATUS_FAILED, + STATUS_NEW, + STATUS_PAUSED, + STATUS_RUNNING, +) + +ALL_ARGS = {"-exp_dir", "-frequency"} +PROXY_ENTRY_POINT = "smartsim._core.entrypoints.indirect" +CFG_TM_ENABLED_ATTR = "telemetry_enabled" + + +for_all_wlm_launchers = pytest.mark.parametrize( + "wlm_launcher", + [pytest.param(cls(), id=cls.__name__) for cls in WLMLauncher.__subclasses__()], +) + +requires_wlm = pytest.mark.skipif( + pytest.test_launcher == "local", reason="Test requires WLM" +) + + +logger = logging.getLogger() + +# The tests in this file belong to the slow_tests group +pytestmark = pytest.mark.slow_tests + + +@pytest.fixture(autouse=True) +def turn_on_tm(monkeypatch): + monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, property(lambda self: True)) + yield + + +def snooze_nonblocking(test_dir: str, max_delay: int = 20, post_data_delay: int = 2): + telmon_subdir = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + # let the non-blocking experiment complete. 
+ for _ in range(max_delay): + time.sleep(1) + if telmon_subdir.exists(): + time.sleep(post_data_delay) + break + + +@pytest.mark.parametrize( + ["cmd", "missing"], + [ + pytest.param("", {"-exp_dir", "-frequency"}, id="no args"), + pytest.param("-exp_dir /foo/bar", {"-frequency"}, id="no freq"), + pytest.param("-frequency 123", {"-exp_dir"}, id="no dir"), + ], +) +def test_parser_reqd_args(capsys, cmd, missing): + """Test that the parser reports any missing required arguments""" + parser = get_parser() + + args = cmd.split() + + captured = capsys.readouterr() # throw away existing output + with pytest.raises(SystemExit) as ex: + ns = parser.parse_args(args) + + captured = capsys.readouterr() + assert "the following arguments are required" in captured.err + err_desc = captured.err.split("the following arguments are required:")[-1] + for arg in missing: + assert arg in err_desc + + expected = ALL_ARGS - missing + for exp in expected: + assert exp not in err_desc + + +def test_parser(): + """Test that the parser succeeds when receiving expected args""" + parser = get_parser() + + test_dir = "/foo/bar" + test_freq = 123 + + cmd = f"-exp_dir {test_dir} -frequency {test_freq}" + args = cmd.split() + + ns = parser.parse_args(args) + + assert ns.exp_dir == test_dir + assert ns.frequency == test_freq + + +def test_ts(): + """Ensure expected output type""" + ts = get_ts() + assert isinstance(ts, int) + + +@pytest.mark.parametrize( + ["etype", "task_id", "step_id", "timestamp", "evt_type"], + [ + pytest.param("ensemble", "", "123", get_ts(), "start", id="start event"), + pytest.param("ensemble", "", "123", get_ts(), "stop", id="stop event"), + ], +) +def test_track_event( + etype: str, + task_id: str, + step_id: str, + timestamp: int, + evt_type: str, + test_dir: str, +): + """Ensure that track event writes a file to the expected location""" + exp_path = pathlib.Path(test_dir) + track_event(timestamp, task_id, step_id, etype, evt_type, exp_path, logger) + + expected_output = 
exp_path / f"{evt_type}.json" + + assert expected_output.exists() + assert expected_output.is_file() + + +def test_load_manifest(fileutils: FileUtils, test_dir: str): + """Ensure that the runtime manifest loads correctly""" + sample_manifest_path = fileutils.get_test_conf_path("telemetry/telemetry.json") + sample_manifest = pathlib.Path(sample_manifest_path) + assert sample_manifest.exists() + + test_manifest_path = fileutils.make_test_file( + serialize.MANIFEST_FILENAME, + pathlib.Path(test_dir) / serialize.TELMON_SUBDIR, + sample_manifest.read_text(), + ) + test_manifest = pathlib.Path(test_manifest_path) + assert test_manifest.exists() + + manifest = load_manifest(test_manifest_path) + assert manifest.name == "my-exp" + assert str(manifest.path) == "/path/to/my-exp" + assert manifest.launcher == "Slurm" + assert len(manifest.runs) == 6 + + assert len(manifest.runs[0].models) == 1 + assert len(manifest.runs[2].models) == 8 # 8 models in ensemble + assert len(manifest.runs[0].orchestrators) == 0 + assert len(manifest.runs[1].orchestrators) == 3 # 3 shards in db + + +def test_load_manifest_colo_model(fileutils: FileUtils): + """Ensure that the runtime manifest loads correctly when containing a colocated model""" + # NOTE: for regeneration, this manifest can use `test_telemetry_colo` + sample_manifest_path = fileutils.get_test_conf_path("telemetry/colocatedmodel.json") + sample_manifest = pathlib.Path(sample_manifest_path) + assert sample_manifest.exists() + + manifest = load_manifest(sample_manifest_path) + assert manifest.name == "my-exp" + assert str(manifest.path) == "/tmp/my-exp" + assert manifest.launcher == "Slurm" + assert len(manifest.runs) == 1 + + assert len(manifest.runs[0].models) == 1 + + +def test_load_manifest_serial_models(fileutils: FileUtils): + """Ensure that the runtime manifest loads correctly when containing multiple models""" + # NOTE: for regeneration, this manifest can use `test_telemetry_colo` + sample_manifest_path = 
fileutils.get_test_conf_path("telemetry/serialmodels.json") + sample_manifest = pathlib.Path(sample_manifest_path) + assert sample_manifest.exists() + + manifest = load_manifest(sample_manifest_path) + assert manifest.name == "my-exp" + assert str(manifest.path) == "/tmp/my-exp" + assert manifest.launcher == "Slurm" + assert len(manifest.runs) == 1 + + assert len(manifest.runs[0].models) == 5 + + +def test_load_manifest_db_and_models(fileutils: FileUtils): + """Ensure that the runtime manifest loads correctly when containing models & + orchestrator across 2 separate runs""" + # NOTE: for regeneration, this manifest can use `test_telemetry_colo` + sample_manifest_path = fileutils.get_test_conf_path("telemetry/db_and_model.json") + sample_manifest = pathlib.Path(sample_manifest_path) + assert sample_manifest.exists() + + manifest = load_manifest(sample_manifest_path) + assert manifest.name == "my-exp" + assert str(manifest.path) == "/tmp/my-exp" + assert manifest.launcher == "Slurm" + assert len(manifest.runs) == 2 + + assert len(manifest.runs[0].orchestrators) == 1 + assert len(manifest.runs[1].models) == 1 + + +def test_load_manifest_db_and_models_1run(fileutils: FileUtils): + """Ensure that the runtime manifest loads correctly when containing models & + orchestrator in a single run""" + # NOTE: for regeneration, this manifest can use `test_telemetry_colo` + sample_manifest_path = fileutils.get_test_conf_path( + "telemetry/db_and_model_1run.json" + ) + sample_manifest = pathlib.Path(sample_manifest_path) + assert sample_manifest.exists() + + manifest = load_manifest(sample_manifest_path) + assert manifest.name == "my-exp" + assert str(manifest.path) == "/tmp/my-exp" + assert manifest.launcher == "Slurm" + assert len(manifest.runs) == 1 + + assert len(manifest.runs[0].orchestrators) == 1 + assert len(manifest.runs[0].models) == 1 + + +@pytest.mark.parametrize( + ["task_id", "step_id", "etype", "exp_isorch", "exp_ismanaged"], + [ + pytest.param("123", "", "model", 
False, False, id="unmanaged, non-orch"), + pytest.param("456", "123", "ensemble", False, True, id="managed, non-orch"), + pytest.param("789", "987", "orchestrator", True, True, id="managed, orch"), + pytest.param("987", "", "orchestrator", True, False, id="unmanaged, orch"), + ], +) +def test_persistable_computed_properties( + task_id: str, step_id: str, etype: str, exp_isorch: bool, exp_ismanaged: bool +): + name = f"test-{etype}-{uuid.uuid4()}" + timestamp = get_ts() + exp_dir = pathlib.Path("/foo/bar") + stored = { + "name": name, + "run_id": timestamp, + "telemetry_metadata": { + "status_dir": str(exp_dir), + "task_id": task_id, + "step_id": step_id, + }, + } + persistables = hydrate_persistable(etype, stored, exp_dir) + persistable = persistables[0] if persistables else None + + assert persistable.is_managed == exp_ismanaged + assert persistable.is_db == exp_isorch + + +def test_deserialize_ensemble(fileutils: FileUtils): + """Ensure that the children of ensembles (models) are correctly + placed in the models collection""" + sample_manifest_path = fileutils.get_test_conf_path("telemetry/ensembles.json") + sample_manifest = pathlib.Path(sample_manifest_path) + assert sample_manifest.exists() + + manifest = load_manifest(sample_manifest_path) + assert manifest + + assert len(manifest.runs) == 1 + + # NOTE: no longer returning ensembles, only children... 
+ # assert len(manifest.runs[0].ensembles) == 1 + assert len(manifest.runs[0].models) == 8 + + +def test_shutdown_conditions(): + """Ensure conditions to shutdown telemetry monitor are correctly evaluated""" + job_entity1 = JobEntity() + job_entity1.name = "xyz" + job_entity1.step_id = "123" + job_entity1.task_id = "" + + logger = logging.getLogger() + + # show that an event handler w/no monitored jobs can shutdown + mani_handler = ManifestEventHandler("xyz", logger) + assert can_shutdown(mani_handler, logger) + + # show that an event handler w/a monitored job cannot shutdown + mani_handler = ManifestEventHandler("xyz", logger) + mani_handler.job_manager.add_job( + job_entity1.name, job_entity1.step_id, job_entity1, False + ) + assert not can_shutdown(mani_handler, logger) + assert not bool(mani_handler.job_manager.db_jobs) + assert bool(mani_handler.job_manager.jobs) + + # show that an event handler w/a monitored db cannot shutdown + mani_handler = ManifestEventHandler("xyz", logger) + job_entity1.type = "orchestrator" + mani_handler.job_manager.add_job( + job_entity1.name, job_entity1.step_id, job_entity1, False + ) + assert not can_shutdown(mani_handler, logger) + assert bool(mani_handler.job_manager.db_jobs) + assert not bool(mani_handler.job_manager.jobs) + + # show that an event handler w/a dbs & tasks cannot shutdown + job_entity2 = JobEntity() + job_entity2.name = "xyz" + job_entity2.step_id = "123" + job_entity2.task_id = "" + + mani_handler = ManifestEventHandler("xyz", logger) + job_entity1.type = "orchestrator" + mani_handler.job_manager.add_job( + job_entity1.name, job_entity1.step_id, job_entity1, False + ) + + mani_handler.job_manager.add_job( + job_entity2.name, job_entity2.step_id, job_entity2, False + ) + assert not can_shutdown(mani_handler, logger) + assert bool(mani_handler.job_manager.db_jobs) + assert bool(mani_handler.job_manager.jobs) + + # ... 
now, show that removing 1 of 2 jobs still doesn't shutdown + mani_handler.job_manager.db_jobs.popitem() + assert not can_shutdown(mani_handler, logger) + + # ... now, show that removing final job will allow shutdown + mani_handler.job_manager.jobs.popitem() + assert can_shutdown(mani_handler, logger) + + +def test_auto_shutdown(): + """Ensure that the cooldown timer is respected""" + + class FauxObserver: + def __init__(self): + self.stop_count = 0 + + def stop(self): + self.stop_count += 1 + + def is_alive(self) -> bool: + if self.stop_count > 0: + return False + + return True + + job_entity1 = JobEntity() + job_entity1.name = "xyz" + job_entity1.step_id = "123" + job_entity1.task_id = "" + + frequency = 1 + + # show that an event handler w/out a monitored task will automatically stop + mani_handler = ManifestEventHandler("xyz", logger) + observer = FauxObserver() + duration = 2 + + ts0 = get_ts() + event_loop(observer, mani_handler, frequency, logger, cooldown_duration=duration) + ts1 = get_ts() + + assert ts1 - ts0 >= duration + assert observer.stop_count == 1 + + # show that the new cooldown duration is respected + mani_handler = ManifestEventHandler("xyz", logger) + observer = FauxObserver() + duration = 5 + + ts0 = get_ts() + event_loop(observer, mani_handler, frequency, logger, cooldown_duration=duration) + ts1 = get_ts() + + assert ts1 - ts0 >= duration + assert observer.stop_count == 1 + + +def test_telemetry_single_model(fileutils, test_dir, wlmutils): + """Test that it is possible to create_database then colocate_db_uds/colocate_db_tcp + with unique db_identifiers""" + + # Set experiment name + exp_name = "telemetry_single_model" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_script = fileutils.get_test_conf_path("echo.py") + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + # create run settings + app_settings = 
exp.create_run_settings("python", test_script) + app_settings.set_nodes(1) + app_settings.set_tasks_per_node(1) + + # Create the SmartSim Model + smartsim_model = exp.create_model("perroquet", app_settings) + exp.generate(smartsim_model) + exp.start(smartsim_model, block=True) + assert exp.get_status(smartsim_model)[0] == STATUS_COMPLETED + + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + start_events = list(telemetry_output_path.rglob("start.json")) + stop_events = list(telemetry_output_path.rglob("stop.json")) + + assert len(start_events) == 1 + assert len(stop_events) == 1 + + +def test_telemetry_single_model_nonblocking(fileutils, test_dir, wlmutils, monkeypatch): + """Ensure that the telemetry monitor logs exist when the experiment + is non-blocking""" + with monkeypatch.context() as ctx: + ctx.setattr(cfg.Config, "telemetry_frequency", 1) + + # Set experiment name + exp_name = "test_telemetry_single_model_nonblocking" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_script = fileutils.get_test_conf_path("echo.py") + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + # create run settings + app_settings = exp.create_run_settings("python", test_script) + app_settings.set_nodes(1) + app_settings.set_tasks_per_node(1) + + # Create the SmartSim Model + smartsim_model = exp.create_model("perroquet", app_settings) + exp.generate(smartsim_model) + exp.start(smartsim_model) + + snooze_nonblocking(test_dir, max_delay=60, post_data_delay=30) + + assert exp.get_status(smartsim_model)[0] == STATUS_COMPLETED + + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + start_events = list(telemetry_output_path.rglob("start.json")) + stop_events = list(telemetry_output_path.rglob("stop.json")) + + assert len(start_events) == 1 + assert len(stop_events) == 1 + + +def test_telemetry_serial_models(fileutils, test_dir, 
wlmutils, monkeypatch): + """ + Test telemetry with models being run in serial (one after each other) + """ + with monkeypatch.context() as ctx: + ctx.setattr(cfg.Config, "telemetry_frequency", 1) + + # Set experiment name + exp_name = "telemetry_serial_models" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_script = fileutils.get_test_conf_path("echo.py") + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + # create run settings + app_settings = exp.create_run_settings("python", test_script) + app_settings.set_nodes(1) + app_settings.set_tasks_per_node(1) + + # Create the SmartSim Model + smartsim_models = [ + exp.create_model(f"perroquet_{i}", app_settings) for i in range(5) + ] + exp.generate(*smartsim_models) + exp.start(*smartsim_models, block=True) + assert all( + [status == STATUS_COMPLETED for status in exp.get_status(*smartsim_models)] + ) + + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + start_events = list(telemetry_output_path.rglob("start.json")) + stop_events = list(telemetry_output_path.rglob("stop.json")) + + assert len(start_events) == 5 + assert len(stop_events) == 5 + + +def test_telemetry_serial_models_nonblocking( + fileutils, test_dir, wlmutils, monkeypatch +): + """ + Test telemetry with models being run in serial (one after each other) + in a non-blocking experiment + """ + with monkeypatch.context() as ctx: + ctx.setattr(cfg.Config, "telemetry_frequency", 1) + + # Set experiment name + exp_name = "telemetry_serial_models" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_script = fileutils.get_test_conf_path("echo.py") + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + # create run settings + app_settings = exp.create_run_settings("python", test_script) + app_settings.set_nodes(1) + 
app_settings.set_tasks_per_node(1) + + # Create the SmartSim Model + smartsim_models = [ + exp.create_model(f"perroquet_{i}", app_settings) for i in range(5) + ] + exp.generate(*smartsim_models) + exp.start(*smartsim_models) + + snooze_nonblocking(test_dir, max_delay=60, post_data_delay=10) + + assert all( + [status == STATUS_COMPLETED for status in exp.get_status(*smartsim_models)] + ) + + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + start_events = list(telemetry_output_path.rglob("start.json")) + stop_events = list(telemetry_output_path.rglob("stop.json")) + + assert len(start_events) == 5 + assert len(stop_events) == 5 + + +def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch): + """ + Test telemetry with only a database running + """ + with monkeypatch.context() as ctx: + ctx.setattr(cfg.Config, "telemetry_frequency", 1) + + # Set experiment name + exp_name = "telemetry_db_with_generate" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + # create regular database + orc = exp.create_database(port=test_port, interface=test_interface) + exp.generate(orc) + try: + exp.start(orc, block=True) + + snooze_nonblocking(test_dir, max_delay=60, post_data_delay=10) + + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + start_events = list(telemetry_output_path.rglob("start.json")) + stop_events = list(telemetry_output_path.rglob("stop.json")) + + assert len(start_events) == 1 + assert len(stop_events) <= 1 + finally: + exp.stop(orc) + snooze_nonblocking(test_dir, max_delay=60, post_data_delay=10) + + assert exp.get_status(orc)[0] == STATUS_CANCELLED + + stop_events = list(telemetry_output_path.rglob("stop.json")) + assert len(stop_events) == 1 + + +def 
test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch): + """ + Test telemetry with only a non-generated database running + """ + with monkeypatch.context() as ctx: + ctx.setattr(cfg.Config, "telemetry_frequency", 1) + + # Set experiment name + exp_name = "telemetry_db_only_without_generate" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + # create regular database + orc = exp.create_database(port=test_port, interface=test_interface) + orc.set_path(test_dir) + + try: + exp.start(orc) + + snooze_nonblocking(test_dir, max_delay=60, post_data_delay=30) + + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + start_events = list(telemetry_output_path.rglob("start.json")) + stop_events = list(telemetry_output_path.rglob("stop.json")) + + assert len(start_events) == 1 + assert len(stop_events) == 0 + finally: + exp.stop(orc) + + snooze_nonblocking(test_dir, max_delay=60, post_data_delay=10) + assert exp.get_status(orc)[0] == STATUS_CANCELLED + + stop_events = list(telemetry_output_path.rglob("stop.json")) + assert len(stop_events) == 1 + + +def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch): + """ + Test telemetry with only a database and a model running + """ + + with monkeypatch.context() as ctx: + ctx.setattr(cfg.Config, "telemetry_frequency", 1) + + # Set experiment name + exp_name = "telemetry_db_and_model" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_script = fileutils.get_test_conf_path("echo.py") + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + # create 
regular database + orc = exp.create_database(port=test_port, interface=test_interface) + exp.generate(orc) + try: + exp.start(orc) + + # create run settings + app_settings = exp.create_run_settings("python", test_script) + app_settings.set_nodes(1) + app_settings.set_tasks_per_node(1) + + # Create the SmartSim Model + smartsim_model = exp.create_model("perroquet", app_settings) + exp.generate(smartsim_model) + exp.start(smartsim_model, block=True) + finally: + exp.stop(orc) + + snooze_nonblocking(test_dir, max_delay=60, post_data_delay=30) + + assert exp.get_status(orc)[0] == STATUS_CANCELLED + assert exp.get_status(smartsim_model)[0] == STATUS_COMPLETED + + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + + start_events = list(telemetry_output_path.rglob("database/**/start.json")) + stop_events = list(telemetry_output_path.rglob("database/**/stop.json")) + + assert len(start_events) == 1 + assert len(stop_events) == 1 + + start_events = list(telemetry_output_path.rglob("model/**/start.json")) + stop_events = list(telemetry_output_path.rglob("model/**/stop.json")) + assert len(start_events) == 1 + assert len(stop_events) == 1 + + +def test_telemetry_ensemble(fileutils, test_dir, wlmutils, monkeypatch): + """ + Test telemetry with only an ensemble + """ + + with monkeypatch.context() as ctx: + ctx.setattr(cfg.Config, "telemetry_frequency", 1) + + # Set experiment name + exp_name = "telemetry_ensemble" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_script = fileutils.get_test_conf_path("echo.py") + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + app_settings = exp.create_run_settings("python", test_script) + app_settings.set_nodes(1) + app_settings.set_tasks_per_node(1) + + ens = exp.create_ensemble("troupeau", run_settings=app_settings, replicas=5) + exp.generate(ens) + exp.start(ens, block=True) + assert all([status == 
STATUS_COMPLETED for status in exp.get_status(ens)]) + + snooze_nonblocking(test_dir, max_delay=60, post_data_delay=30) + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + start_events = list(telemetry_output_path.rglob("start.json")) + stop_events = list(telemetry_output_path.rglob("stop.json")) + + assert len(start_events) == 5 + assert len(stop_events) == 5 + + +def test_telemetry_colo(fileutils, test_dir, wlmutils, coloutils, monkeypatch): + """ + Test telemetry with only a colocated model running + """ + + with monkeypatch.context() as ctx: + ctx.setattr(cfg.Config, "telemetry_frequency", 1) + + # Set experiment name + exp_name = "telemetry_colo" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + smartsim_model = coloutils.setup_test_colo( + fileutils, + "uds", + exp, + "echo.py", + {}, + ) + + exp.generate(smartsim_model) + exp.start(smartsim_model, block=True) + assert all( + [status == STATUS_COMPLETED for status in exp.get_status(smartsim_model)] + ) + + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + start_events = list(telemetry_output_path.rglob("start.json")) + stop_events = list(telemetry_output_path.rglob("stop.json")) + + # the colodb does NOT show up as a unique entity in the telemetry + assert len(start_events) == 1 + assert len(stop_events) == 1 + + +@pytest.mark.parametrize( + "frequency, cooldown", + [ + pytest.param(1, 1, id="1s shutdown"), + pytest.param(1, 5, id="5s shutdown"), + pytest.param(1, 15, id="15s shutdown"), + ], +) +def test_telemetry_autoshutdown(test_dir, wlmutils, monkeypatch, frequency, cooldown): + """ + Ensure that the telemetry monitor process shuts down after the desired + cooldown period + """ + + with monkeypatch.context() as ctx: + ctx.setattr(cfg.Config, "telemetry_frequency", frequency) + ctx.setattr(cfg.Config, 
"telemetry_cooldown", cooldown) + + # Set experiment name + exp_name = "telemetry_ensemble" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + start_time = get_ts() + stop_time = start_time + exp.start(block=False) + + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + empty_mani = list(telemetry_output_path.rglob("manifest.json")) + assert len(empty_mani) == 1, "an manifest.json should be created" + + popen = exp._control._telemetry_monitor + assert popen.pid > 0 + assert popen.returncode is None + + # give some leeway during testing for the cooldown to get hit + for i in range(10): + if popen.poll() is not None: + stop_time = get_ts() + print(f"Completed polling for telemetry shutdown after {i} attempts") + break + time.sleep(3) + + assert popen.returncode is not None + assert stop_time >= (start_time + cooldown) + + +class MockStep(Step): + """Mock step to implement any abstract methods so that it can be + instanced for test purposes + """ + + def get_launch_cmd(self): + return ["spam", "eggs"] + + +@pytest.fixture +def mock_step_meta_dict(test_dir): + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + yield { + "entity_type": "mock", + "status_dir": telemetry_output_path, + } + + +@pytest.fixture +def mock_step(test_dir, mock_step_meta_dict): + rs = RunSettings("echo") + step = MockStep("mock-step", test_dir, rs) + step.meta = mock_step_meta_dict + yield step + + +def test_proxy_launch_cmd_decorator_reformats_cmds(mock_step, monkeypatch): + monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, True) + get_launch_cmd = proxyable_launch_cmd(lambda step: ["some", "cmd", "list"]) + cmd = get_launch_cmd(mock_step) + assert cmd != ["some", "cmd", "list"] + assert sys.executable in cmd + assert PROXY_ENTRY_POINT in cmd + + +def 
test_proxy_launch_cmd_decorator_does_not_reformat_cmds_if_the_tm_is_off( + mock_step, monkeypatch +): + monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, False) + get_launch_cmd = proxyable_launch_cmd(lambda step: ["some", "cmd", "list"]) + cmd = get_launch_cmd(mock_step) + assert cmd == ["some", "cmd", "list"] + + +def test_proxy_launch_cmd_decorator_errors_if_attempt_to_proxy_a_managed_step( + mock_step, monkeypatch +): + monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, True) + mock_step.managed = True + get_launch_cmd = proxyable_launch_cmd(lambda step: ["some", "cmd", "list"]) + with pytest.raises(UnproxyableStepError): + get_launch_cmd(mock_step) + + +@for_all_wlm_launchers +def test_unmanaged_steps_are_proxyed_through_indirect( + wlm_launcher, mock_step_meta_dict, test_dir, monkeypatch +): + monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, True) + rs = RunSettings("echo", ["hello", "world"]) + step = wlm_launcher.create_step("test-step", test_dir, rs) + step.meta = mock_step_meta_dict + assert isinstance(step, Step) + assert not step.managed + cmd = step.get_launch_cmd() + assert sys.executable in cmd + assert PROXY_ENTRY_POINT in cmd + assert "hello" not in cmd + assert "world" not in cmd + + +@for_all_wlm_launchers +def test_unmanaged_steps_are_not_proxied_if_the_telemetry_monitor_is_disabled( + wlm_launcher, mock_step_meta_dict, test_dir, monkeypatch +): + monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, False) + rs = RunSettings("echo", ["hello", "world"]) + step = wlm_launcher.create_step("test-step", test_dir, rs) + step.meta = mock_step_meta_dict + assert isinstance(step, Step) + assert not step.managed + cmd = step.get_launch_cmd() + assert PROXY_ENTRY_POINT not in cmd + assert "hello" in cmd + assert "world" in cmd + + +@requires_wlm +@pytest.mark.parametrize( + "run_command", + [ + pytest.param("", id="Unmanaged"), + pytest.param("auto", id="Managed"), + ], +) +def test_multistart_experiment( + wlmutils: WLMUtils, + fileutils: 
FileUtils, + test_dir: str, + monkeypatch: pytest.MonkeyPatch, + run_command: str, +): + """Run an experiment with multiple start calls to ensure that telemetry is + saved correctly for each run + """ + + exp_name = "my-exp" + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) + rs_e = exp.create_run_settings( + sys.executable, ["printing_model.py"], run_command=run_command + ) + rs_e.set_nodes(1) + rs_e.set_tasks(1) + ens = exp.create_ensemble( + "my-ens", + run_settings=rs_e, + perm_strategy="all_perm", + params={ + "START": ["spam"], + "MID": ["eggs"], + "END": ["sausage", "and spam"], + }, + ) + + test_script_path = fileutils.get_test_conf_path("printing_model.py") + ens.attach_generator_files(to_configure=[test_script_path]) + + rs_m = exp.create_run_settings("echo", ["hello", "world"], run_command=run_command) + rs_m.set_nodes(1) + rs_m.set_tasks(1) + model = exp.create_model("my-model", run_settings=rs_m) + + db = exp.create_database( + db_nodes=1, + port=wlmutils.get_test_port(), + interface=wlmutils.get_test_interface(), + ) + + exp.generate(db, ens, model, overwrite=True) + + with monkeypatch.context() as ctx: + ctx.setattr(cfg.Config, "telemetry_frequency", 1) + ctx.setattr(cfg.Config, "telemetry_cooldown", 45) + + exp.start(model, block=False) + + # track PID to see that telmon cooldown avoids restarting process + tm_pid = exp._control._telemetry_monitor.pid + + exp.start(db, block=False) + # check that same TM proc is active + assert tm_pid == exp._control._telemetry_monitor.pid + try: + exp.start(ens, block=True, summary=True) + finally: + exp.stop(db) + assert tm_pid == exp._control._telemetry_monitor.pid + time.sleep(3) # time for telmon to write db stop event + + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + + db_start_events = list(telemetry_output_path.rglob("database/**/start.json")) + db_stop_events = list(telemetry_output_path.rglob("database/**/stop.json")) + assert 
len(db_start_events) == 1 + assert len(db_stop_events) == 1 + + m_start_events = list(telemetry_output_path.rglob("model/**/start.json")) + m_stop_events = list(telemetry_output_path.rglob("model/**/stop.json")) + assert len(m_start_events) == 1 + assert len(m_stop_events) == 1 + + e_start_events = list(telemetry_output_path.rglob("ensemble/**/start.json")) + e_stop_events = list(telemetry_output_path.rglob("ensemble/**/stop.json")) + assert len(e_start_events) == 2 + assert len(e_stop_events) == 2 + + +@pytest.mark.parametrize( + "status_in, expected_out", + [ + pytest.param(STATUS_CANCELLED, 1, id="failure on cancellation"), + pytest.param(STATUS_COMPLETED, 0, id="success on completion"), + pytest.param(STATUS_FAILED, 1, id="failure on failed"), + pytest.param(STATUS_NEW, None, id="failure on new"), + pytest.param(STATUS_PAUSED, None, id="failure on paused"), + pytest.param(STATUS_RUNNING, None, id="failure on running"), + ], +) +def test_faux_rc(status_in: str, expected_out: t.Optional[int]): + """Ensure faux response codes match expectations.""" + step_info = StepInfo(status=status_in) + + rc = faux_return_code(step_info) + assert rc == expected_out + + +@pytest.mark.parametrize( + "status_in, expected_out, expected_has_jobs", + [ + pytest.param(STATUS_CANCELLED, 1, False, id="failure on cancellation"), + pytest.param(STATUS_COMPLETED, 0, False, id="success on completion"), + pytest.param(STATUS_FAILED, 1, False, id="failure on failed"), + pytest.param(STATUS_NEW, None, True, id="failure on new"), + pytest.param(STATUS_PAUSED, None, True, id="failure on paused"), + pytest.param(STATUS_RUNNING, None, True, id="failure on running"), + ], +) +def test_wlm_completion_handling( + test_dir: str, + monkeypatch: pytest.MonkeyPatch, + status_in: str, + expected_out: t.Optional[int], + expected_has_jobs: bool, +): + def get_faux_update(status: str) -> t.Callable: + def _faux_updates(_self: WLMLauncher, _names: t.List[str]) -> t.List[StepInfo]: + return [("faux-name", 
StepInfo(status=status))] + + return _faux_updates + + ts = get_ts() + with monkeypatch.context() as ctx: + # don't actually start a job manager + ctx.setattr(JobManager, "start", lambda x: ...) + ctx.setattr(SlurmLauncher, "get_step_update", get_faux_update(status_in)) + + mani_handler = ManifestEventHandler("xyz", logger) + mani_handler.set_launcher("slurm") + + # prep a fake job to request updates for + job_entity = JobEntity() + job_entity.name = "faux-name" + job_entity.step_id = "faux-step-id" + job_entity.task_id = 1234 + job_entity.status_dir = test_dir + job_entity.type = "orchestrator" + + job = Job(job_entity.name, job_entity.step_id, job_entity, "slurm", True) + + # populate our tracking collections + mani_handler._tracked_jobs = {job_entity.key: job_entity} + mani_handler.job_manager.jobs[job.name] = job + + mani_handler.on_timestep(ts) + + # see that the job queue was properly manipulated + has_jobs = bool(mani_handler._tracked_jobs) + assert expected_has_jobs == has_jobs + + # see that the event was properly written + stop_event_path = pathlib.Path(test_dir) / "stop.json" + + # if a status wasn't terminal, no stop event should have been written + should_have_stop_event = False if expected_out is None else True + assert should_have_stop_event == stop_event_path.exists() diff --git a/tutorials/getting_started/consumer.py b/tutorials/getting_started/consumer.py index 0a16c8a26..aef71f220 100644 --- a/tutorials/getting_started/consumer.py +++ b/tutorials/getting_started/consumer.py @@ -1,13 +1,16 @@ import argparse import os -from smartredis import Client +from smartredis import Client, ConfigOptions parser = argparse.ArgumentParser(description="SmartRedis ensemble consumer process.") parser.add_argument("--redis-port") args = parser.parse_args() # get model and set into database -c = Client(address="127.0.0.1:"+str(args.redis_port), cluster=False) +address = "127.0.0.1:" + str(args.redis_port) +os.environ["SSDB"] = address +c = Client(None, 
logger_name="SmartSim") + # Incoming entity prefixes are stored as a comma-separated list # in the env variable SSKEYIN diff --git a/tutorials/getting_started/multi_db_example/application_script.py b/tutorials/getting_started/multi_db_example/application_script.py new file mode 100644 index 000000000..239c16684 --- /dev/null +++ b/tutorials/getting_started/multi_db_example/application_script.py @@ -0,0 +1,37 @@ +from smartredis import ConfigOptions, Client +from smartredis import * +from smartredis.error import * + +# Initialize a ConfigOptions object +single_shard_config = ConfigOptions.create_from_environment("single_shard_db_identifier") +# Initialize a SmartRedis client for the single sharded database +app_single_shard_client = Client(single_shard_config, logger_name="Model: single shard logger") + +# Initialize a ConfigOptions object +multi_shard_config = ConfigOptions.create_from_environment("multi_shard_db_identifier") +# Initialize a SmartRedis client for the multi sharded database +app_multi_shard_client = Client(multi_shard_config, logger_name="Model: multi shard logger") + +# Initialize a ConfigOptions object +colo_config = ConfigOptions.create_from_environment("colo_db_identifier") +# Initialize a SmartRedis client for the colocated database +colo_client = Client(colo_config, logger_name="Model: colo logger") + +# Retrieve the tensor placed in driver script using the associated client +val1 = app_single_shard_client.get_tensor("tensor_1") +val2 = app_multi_shard_client.get_tensor("tensor_2") + +# Print message to stdout using SmartRedis Client logger +app_single_shard_client.log_data(LLInfo, f"The single sharded db tensor is: {val1}") +app_multi_shard_client.log_data(LLInfo, f"The multi sharded db tensor is: {val2}") + +# Place retrieved tensors in colocated database +colo_client.put_tensor("tensor_1", val1) +colo_client.put_tensor("tensor_2", val2) + +# Check that tensors are in colocated database +colo_val1 = colo_client.poll_tensor("tensor_1", 10, 
10) +colo_val2 = colo_client.poll_tensor("tensor_2", 10, 10) +# Print message to stdout using SmartRedis Client logger +colo_client.log_data(LLInfo, f"The colocated db has tensor_1: {colo_val1}") +colo_client.log_data(LLInfo, f"The colocated db has tensor_2: {colo_val2}") \ No newline at end of file diff --git a/tutorials/getting_started/multi_db_example/multidb_driver.py b/tutorials/getting_started/multi_db_example/multidb_driver.py new file mode 100644 index 000000000..fae6a9b15 --- /dev/null +++ b/tutorials/getting_started/multi_db_example/multidb_driver.py @@ -0,0 +1,59 @@ +import numpy as np +from smartredis import Client +from smartsim import Experiment +from smartsim.log import get_logger +import sys + +exe_ex = sys.executable +logger = get_logger("Multidb Experiment Log") +# Initialize the Experiment +exp = Experiment("getting-started-multidb", launcher="auto") + +# Initialize a single sharded database +single_shard_db = exp.create_database(port=6379, db_nodes=1, interface="ib0", db_identifier="single_shard_db_identifier") +exp.generate(single_shard_db, overwrite=True) + +# Initialize a multi sharded database +multi_shard_db = exp.create_database(port=6380, db_nodes=3, interface="ib0", db_identifier="multi_shard_db_identifier") +exp.generate(multi_shard_db, overwrite=True) + +# Launch the single and multi sharded database +exp.start(single_shard_db, multi_shard_db, summary=True) + +# Initialize SmartRedis client for single sharded database +driver_client_single_shard = Client(cluster=False, address=single_shard_db.get_address()[0], logger_name="Single shard db logger") +# Initialize SmartRedis client for multi sharded database +driver_client_multi_shard = Client(cluster=True, address=multi_shard_db.get_address()[0], logger_name="Multi shard db logger") + +# Create NumPy array +array_1 = np.array([1, 2, 3, 4]) +# Use single shard db SmartRedis client to place tensor in single sharded db +driver_client_single_shard.put_tensor("tensor_1", array_1) + +# Create 
NumPy array +array_2 = np.array([5, 6, 7, 8]) +# Use single shard db SmartRedis client to place tensor in multi sharded db +driver_client_multi_shard.put_tensor("tensor_2", array_2) + +# Check that tensors are in correct databases +check_single_shard_db_tensor_incorrect = driver_client_single_shard.key_exists("tensor_2") +check_multi_shard_db_tensor_incorrect = driver_client_multi_shard.key_exists("tensor_1") +logger.info(f"The multi shard array key exists in the incorrect database: {check_single_shard_db_tensor_incorrect}") +logger.info(f"The single shard array key exists in the incorrect database: {check_multi_shard_db_tensor_incorrect}") + +# Initialize a RunSettings object +model_settings = exp.create_run_settings(exe=exe_ex, exe_args="./path/to/application_script.py") +# Configure RunSettings object +model_settings.set_nodes(1) +model_settings.set_tasks_per_node(1) +# Initialize a SmartSim Model +model = exp.create_model("colo_model", model_settings) +# Colocate the Model +model.colocate_db_tcp(db_identifier="colo_db_identifier") +# Launch the colocated Model +exp.start(model, block=True, summary=True) + +# Tear down the single and multi sharded databases +exp.stop(single_shard_db, multi_shard_db) +# Print the Experiment summary +logger.info(exp.summary()) \ No newline at end of file diff --git a/tutorials/getting_started/producer.py b/tutorials/getting_started/producer.py index 731bfe402..9622ea7b4 100644 --- a/tutorials/getting_started/producer.py +++ b/tutorials/getting_started/producer.py @@ -1,14 +1,18 @@ import numpy as np import argparse import time +import os -from smartredis import Client +from smartredis import Client, ConfigOptions parser = argparse.ArgumentParser(description="SmartRedis ensemble producer process.") parser.add_argument("--redis-port") args = parser.parse_args() time.sleep(2) -c = Client(address="127.0.0.1:"+str(args.redis_port), cluster=False) +address = "127.0.0.1:" + str(args.redis_port) +os.environ["SSDB"] = address +c = 
Client(None, logger_name="SmartSim") + data = np.random.rand(1, 1, 3, 3) c.put_tensor("product", data) \ No newline at end of file diff --git a/tutorials/online_analysis/lattice/fv_sim.py b/tutorials/online_analysis/lattice/fv_sim.py index 7f5fcf06d..c9c75b88d 100644 --- a/tutorials/online_analysis/lattice/fv_sim.py +++ b/tutorials/online_analysis/lattice/fv_sim.py @@ -49,11 +49,15 @@ def finite_volume_simulation(steps=4000, x_res=400, y_res=100, # send every 5 time_step to reduce memory consumption if time_step % 5 == 0: - dataset = create_dataset(time_step, ux, uy, Feq) + dataset = create_dataset(time_step, ux, uy) client.put_dataset(dataset) + # send last time step to see final result + dataset = create_dataset(time_step, ux, uy) + client.put_dataset(dataset) -def create_dataset(time_step, ux, uy, feq): + +def create_dataset(time_step, ux, uy): """Create SmartRedis Dataset containing multiple NumPy arrays to be stored at a single key within the database""" dataset = Dataset(f"data_{time_step}") diff --git a/tutorials/online_analysis/lattice/online_analysis.ipynb b/tutorials/online_analysis/lattice/online_analysis.ipynb index 857fd2aee..48ddf6032 100644 --- a/tutorials/online_analysis/lattice/online_analysis.ipynb +++ b/tutorials/online_analysis/lattice/online_analysis.ipynb @@ -90,7 +90,7 @@ "\n", "from smartredis import Client\n", "from smartsim import Experiment\n", - "from vishelpers import plot_lattice_vorticity" + "from vishelpers import plot_lattice_vorticity, plot_lattice_norm, plot_lattice_probes" ] }, { @@ -139,7 +139,7 @@ } ], "source": [ - "# create an Orchestrator database reference, \n", + "# create an Orchestrator database reference,\n", "# generate it's output directory, and launch it locally\n", "db = exp.create_database(port=6780, interface=\"lo\")\n", "exp.generate(db, overwrite=True)\n", @@ -171,15 +171,7 @@ "execution_count": 4, "id": "537a1489-b4c3-4736-a628-b7af433a9cbf", "metadata": {}, - "outputs": [ - { - "name": "stdout", - 
"output_type": "stream", - "text": [ - "22:50:07 e3fbeabfdb3e SmartSim[1216] INFO Working in previously created experiment\n" - ] - } - ], + "outputs": [], "source": [ "# set simulation parameters we can pass as executable arguments\n", "time_steps, seed = 3000, 42\n", @@ -193,7 +185,7 @@ "\n", "# Create the Model reference to our simulation and\n", "# attach needed files to be copied, configured, or symlinked into\n", - "# the Model directory at runtime. \n", + "# the Model directory at runtime.\n", "model = exp.create_model(\"fv_simulation\", settings)\n", "model.attach_generator_files(to_copy=\"fv_sim.py\")\n", "exp.generate(model, overwrite=True)" @@ -209,30 +201,23 @@ "name": "stdout", "output_type": "stream", "text": [ - "22:50:07 e3fbeabfdb3e SmartSim[1216] INFO \n", + "19:49:59 C02YR4ANLVCJ SmartSim[54122] INFO \n", "\n", "=== Launch Summary ===\n", "Experiment: finite_volume_simulation\n", - "Experiment Path: /home/craylabs/tutorials/online_analysis/lattice/finite_volume_simulation\n", + "Experiment Path: /Users/arigazzi/Documents/DeepLearning/smartsim-dev/SmartSim/tutorials/online_analysis/lattice/finite_volume_simulation\n", "Launcher: local\n", "Models: 1\n", "Database Status: active\n", "\n", "=== Models ===\n", "fv_simulation\n", - "Executable: /usr/bin/python\n", + "Executable: /usr/local/anaconda3/envs/ss-py3.10/bin/python\n", "Executable Arguments: fv_sim.py --seed=42 --steps=3000\n", "\n", "\n", "\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] } ], "source": [ @@ -264,104 +249,69 @@ "id": "5aac0fa2-88a4-4c70-a187-764c0c97e255", "metadata": {}, "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", "text": [ - "Vorticity plot at timestep: 0\n", - "\n" + "SmartRedis Library@19-49-59:WARNING: Environment variable SR_LOG_FILE is not set. Defaulting to stdout\n", + "SmartRedis Library@19-49-59:WARNING: Environment variable SR_LOG_LEVEL is not set. Defaulting to INFO\n" ] }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] }, - "metadata": { - "needs_background": "light" - }, + "metadata": {}, "output_type": "display_data" }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Vorticity plot at timestep: 700\n", - "\n" - ] - }, { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAvsAAADNCAYAAAAmPb/XAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAxOAAAMTgF/d4wjAABbrUlEQVR4nO2dS68lSZLX7Z77qKysR1fXVBVDzYMWSCOBRrNCiB0sESvEGiEh8REQH4AlX2I2LNig3jIr0DBIzTA0LdRqhqYolaqqa6qrsrOysvJxM/PmPSw8rY5du2bm5o+I8IhjP+noRHh4eHg8zom/m5ubn+z3ewiCIAiCIAiCYHvslq5AEARBEARBEATTEGI/CIIgCIIgCDZKiP0gCIIgCIIg2Cgh9oMgCIIgCIJgo4TYD4IgCIIgCIKNEmI/CIIgCIIgCDbKWekOr7322v7999+foi5BEARBEARBEBTyq1/96vl+v39N2lYs9t9//334/LPPsvn2cAInsP9++RjB8y9Bum7WekleXKf3g6635uV18qzPgVWH2usWyHifizUg3e+1ncPquL5eugYyu+gE58T/4Xhs9f9pre/hud9/J7vd19q2YrEf+PDeWOnB5WnWekne0rJq0R5s6di1DYMSAU+34zZp/1xdgzzW88zvwdpeTGur7+KMKtxryJ1LNAaCAVijUUVCMxav9T08wj2ZVOyv9caU0HoTR26p5hoGuXP3nJcmwul6Tf1q1oP+lDQu19wLsGm2JNqn4ggbA/w3Gv+nY6AZVUp64Ue4lyPUwYNmwBqt/pOI/dFOckqO6VynBH8w/AUS7jXbRrp/Iwr9EevUhaWE/EgNiDmEuHS+G2sAjCx0gtv/Ydb6HP93Wm/82p4dyQia82JYgm5if203KJgeyeefb9fWS/N6/PCDoIQ1vngAYF4hPZJor6XmHHoIdTzuxkQ/wIp/Oxti9Hugve8tF+DRzwlA773mGmVu8R8++0E1tW4zLd2KliUg3HTWBd7LpS0eq2Mqgb22cmvpKdR7lHt9vVnBT4n/4/mwxk6N8n9b+n6n6WtilAZKN7E/ygkFy9BrEHH43W+btf1PzPpinFIU9yx7NPFeSmn9vUKcllsq3jcq+Cnh5rMMuXfmkv/JW3y/e4JQLNHgahb7a7wZQX+mDtMZPvvrRPpTG8WyNCtTCOSWMlvrs3bBD1An4r3711j+j0DwI9T1Mv6/p8WKUsfzLMlWffit7XO69VSJ/bVd/GAaNIFP3TNqffb5eo3PfrAM2mDro6CXCF5KyPeo/wgNAY9o9tTTK+hLjnckgj7HUf4/zMRaAh9QLB/+NYt/Sk7803WvUdRLldiP7rgAwB6IwvOVWvo9eXFd89kP5mFUS1EVOUE2t5Au3Wfq/L32HRHpnnut+jyfp5FgHe+IGgRbEnOjYL2PW96/SzNKPWrxPN+lRlEvMUA3mAVv15anFbv2ATtbYrWinsPFGhddJcJ2JFE/p/jvWUYOj0tM6b4l7jreRmFO9B+RqM+xmf+Swcm9f3N5g77M1cANN56gC62+Z60xgMMyNB2beQm3ivApBLYnb688Jfla95maOeLWWw09bVupxV8T/Edm4dfcNoI+aD3ro0ZEk4x+AOt/NnLBSXIeENq4SJpfIyz7wWYY7Q9rzaz+Wq7Nuj636C/N22O/nvR0ufGUp5XhTafbvYI/WL24C+pY/funkpaelVzeEPtBF0pda3KWhh6M5m84Kqv7Y51KpPYQ0
61lzFHH2rxTliFRGu1G28czkLZGcO92uuW+tKEQFv5brO5/aWCs8XVralCtpZ5ePNZ6KeiJtm4RYj+YBUnc9xp4oh1ra38MvVjVS3Rq67OUb0nBPrfYb9mnd1keC3zNPlxUe/eV9ssJdC1NOl4QLEBO3K9J/K8d73XWBH3JfQqxH1TRwy+/J1ML2J5/gL3KmjNG72RMKeZLRXVJ/lrBPrXQL8nXe99Wcsf2uO6UWMenEOHc2m81GHjDwOpdODJXHyvCVwjRcjwWY55/JI4ltHYuqqFnm0aI/aAKbTDJKGhxhqf8o5CiBmnXZQp3peGZyuLcS1xreWvz9dhWkqck39Rl9EZymeHbAeoG05a44mhRmiwBT7dzSgT/kaL9b4cFuh0uorVIeN5Bo6V4Q21L9QvKCLEfVFPip7/Ej1SzDmkz2En7tYx+P9o/pt4DXWuEstdyn4u24xH6NeLfs700X+s+Sx+jxEffcsEpGUxbI6xpw0MS/VojoaSBoPVQREMgqKCXxbh3aM6py18zXn9+XM4RYj/owsjCtqX3YfTuzUVZWtTXplnrvQW/d3tpvqn2X8MxLSu/tk0bTFuaZi0D2EI+xHo1nv/dsPT7KBGRSzNSXeampCEUbjxBV9YudLVZd9NQ3u37A1bT27JcagmvEduWgO+VV6tLrm4ljOBSM1cdLOu8lG8O63dJ44HWyWoceBoCYd13YYnB2m1bZM0++1bEmWO7jy2E2A/cWH+cI6NNIiL9gYx+Lt2pFXK1luw5rPE123pY9zVGEOy1LFX3FlE7lSDOhdrULP2W+1CN4A+K/6ePPQJNSejNka5FrsHmzRuE2A86oA2eGp1NRLPJMceAzRIrfk+rfMtybT4r7RiY67w9A3JLXXlyeSR/+5z7jpVXagj0cOmJRkCWsAL7GdlPXmqYeClpxKzx2Sitc4j9oIg1ieJcXH9k2HOaSlj1skrPZZGXlrV9PeXXuOa8fHk7rQenp9OU28JU51paB+valLryeKzmPUS01RDggl86bgj5LFZYzp5hjdcm/rzUDPyc61pY0XlK39MljZg13uvSOofYD1wMK4jXzhSCvrePvba9xUpfu9y6XaojF7d7x7Nu+Wt7ODmRj71GPNerhNy1oY0Ay60ml5bLI+XNCfacRV/aJwhmptWH3xOmUzuut35BX0Lsv2JNI9SXYI3++lrdJnXbmUJoz3msuUW9lVa6DeCmQOQi1GPB92wDqJ/gybNtFJZ0VbKuz9VV+j4R/pOkHoFaS79nmwW36Oe2l0TwiYZCM1b8fivf2rVAa1QXTznefXKUXmveK7Gl+9bKUYv9Y7/5PSiJOz8Hs/rf93KHmTv/HK45PUS+lUcT9V43IQnPNbQswHQ7z2PFhdf2r2FJgS5Rck1z+/F89L5T4U+fDRT+lhU/Z/nPiXYpj2XRLyUE/6SM8N4K8rS827Weil6Tg62Boxb73hlWcw/K1lnTn6FUV6/vvkiN9bfFYuwRR15BN4UVv6eIl/ZBoYZCrsU9p1X4lggpy33Eqou3EcDpIepHaRhw0W3ly+Xh26+ubvcAcOu/dHxPb0COnP8+J8R7Fdb/ufROX9P7bA4sH/05tI52fC1yXo0wj8m7jlzs5ziG1t4xUPznXirivSKzpnHg2d6ap0Tga9tqxH6JsPcK/BYfeMn9A0AW7R4h6BGxPQX3KOK9Fo81P+cqJV1zfL4k678l+lst9LkGg1V+CP9qQszXs+nIdEfOUYl97yhzz8N+TI2A3Lmu+s+hxkUlt62kTCvNSu+Zx1tfr1U/J+K9jQBahqeeObh44uuS+wcep0V4zS3o1yL6Pde0t+j1WP6tepQ0AKS4+1p5tUSjIEvJoNAthWb00suHv8fxc3MAeOuk9QpYPRhb9+8/KrGvUSNWt/gweNmUuOfpNVZljzj2rFv1y20rzec5du68JVHeQ9h7r5+HHpMU5RoJreXT/VvzjCT0PW45nv1oPq+lX9vHsvwDJOFPrf40vyT4+
bEly712XH4eteI/BL+bVb+7AgCYR3ttUduF2GdYIaesOO1bejhG+0Ns7lq0hC39xg/NY4lVbVsuLVenkvMowXJxkcInljZ26LZS6z5fltYtqNiRhI8V3YRadqlF3yvyKSV1Lu3N0e5fSehLT/1KhaNmGfeEzuyJNYCWowlkmlbjEiYN7KXl5kS/VsecoA/BX8Vo77olWNKqrVncrfp54L0CUqhQnn/rHJ3Ylx7sVrb2oIw2oKnZ515a10Q+hvXTttP1WncUqU659N6U9CBYYh6gzOdeE/c1wionGlH8aN9eka+l5SjtpZGugSbke/QAeff3nLsnjzd+vnV877badCrGqa8/Duq1LPCSsJ9TgIfgb+YYe/mXFL7asSzx70Fz46Hb13zPatic2Jf88qU8tRzjAJYek2TMjmWh93xQ9NPvly+TALDEfovQn5KaQaJWo8kr3nv43GtWe2mQpSTsNZHfasUH8F+73MRdnmtT0xvkydMi5iW3GU+50raSBp923Fx9StJxG94rWj/LjYfXhdanxsJPy/C6RoXob8aaqTeYhhb/fG2db5P89I8hFOfmxL50g3oK9F7dTGti+D+4nLDH5aurm99S2osX6eXO0+k+mtjHxoBUrxZyL27JjUKzereIWG8DRlvW8ueEpMTJyeEcJUHvEfmlFnyP2K65RrltJWlSPTxYvSSasOV1aI0oo7lX5eLma/XhZXos/LiNWvAxbb+/6dMvHTtn7Zfg+/Ww0IeVf1K2/s4fhZrQn5bes8qZe2Dy3GxO7FOmvmHDi+BKljiv1m677y3wALet87j+/Pnhm4r7ly/T+tXVQexfXt7ezkU+Lx+XPXgFrpSPi3vNQu31Y7eosfpb+0lwUY7L3GVit7sp2Lmgl5YtgV/b8JF6MABuuzNpy17R73Xr6dGofPlSvx7S85ZzgeF1y11rq0FB68GvSa34t3oiLIs9PoM0Vr9mtZfOwRLhvQV6WPm7Epb+acjF2afp2rpGzk0ol29LbE7saz5aU7FFtx7NZ3+qcy0ul4rr3U63ul9epm0o8nH98jK9rF++PAj/J08O266uAJ4+TcIK98UPd+Wx0IQ7F6w8b87XnJdRs79mRS2hZGCodAzJmqyJd7Tk8+183XLRqWnceF2TtG8tTSpL209bz6V78DQ6eR5eZz72QWsIlFivJZcYacbcVvGvbbPcdKjg1+rOoeerLU9FWPmDgcmF3uTL0jqFNxKs8OpbdtvhrF7sS63COYU+/d4q2g9wETQxJbnaUAH//PlhHQX+8+eH5W+/TVb8x4/TSxxFPy1TEvrUwpdzG8FtmnjNuaZIAj/nvqI1CnpY/T1Ix5EaFpJwz4l8zYJfY70HkIV265wBVhlafs+6llaKdu8lwW5Z8HHZG7deEtPW8fkxJeEPII/joHXm9ebb6PG4IOffeDzLBWoKa31NeSH4u7M1I9+I9NAeuYiK0sy8WxT/qxf72iQMc/wQ+TG28nBY/m5TXFfpOt46juQqQ11yqKh/9CitP3x4U+RfXqbPkydp2zffAHz3XRL4332X8qL7DhX5ALIVj7qScKGO3ycnAOfnafnsLH14GsDtdCpyqaD1NBC4yNeENK0rlk/PjeKxAPNz5+laneg14OdpiXurASOtWwLaI+61Zdwv1yDIpdHjI1O57SBWD0/OPcyThq5uVvz6FrjFnx+Loln9aVnSdq3xQQW/VT+pkeClp0gPwT8ZW9UCS+N167GwJtWi28Nnf2VocVUDHe8gmN7X03WPLMsptdKjK87lZRLy+I3i/9mz5JqD6ffupc9vfnMQ++jekxP4AOmleXqaPijUcTsVPijw8XNxkdLx++zs5jqWSUX7+fmhPKkRgCJZsoajL7bH1x3XAfSJhug1oFgiQjs+/+REvuX+VCOi6Lck8nNCXxP4HrHfMktwT4u+JroloS5Zy6UGldZbQMsCuB3KEvPxOnEre67+NGQmRXP5oXWXjqu5IFGXnjlpEe0h+IPB8Pjsl
1Cz7xbFPWfVYn8kMb/mh8U7WMVlgS9AK+8E9rpgQos7WuwfPDhY7C8vk6j/5pu07f79JOI/+STl+3//D+Drr9P2OblzJ4l5/D47S8v8++QE4LXXDgKfNiYADg0DSRjzdS2vJv4BZIs//5aEn9Zz4BHv/Pzot9STQb/5MqJZzemzRMW9Jeq5sM+Jeq/7j3fZSkNqJq+yekE0iz0dxCv1AFjPDheZ2JiQBrvmxKhmaed5pF4RyeXHexxpO0D52JWlCcE/OTVRZI6VnM9+aYTF0olRpcm2wo0nCF4xyw+BCyDqtoNi/9tvk8B/8iR9nj8H+PLLZM3/4otkxf/FLwC++ip9loA2RlDwX18fBA4VtwApfb9P6TiQmAo3FPIYCpD7qQMchBT2CnC4C4JkXdVcDzxjBiQB7xX5vKeCl+9FazDmhLtH5NcI/BLB32O2XBqhCuEDWyWRbYlBuo0+Q9KEU9I37odpNJwlr4uUH7fT42s9DDQPkmsASOdjHUOy+M9BCPbgCJh6vOAxRVhatdgfaabXtbcEPZOR8fweSn6g31v0JZGP/vj376dv6qrz8GGy3N+/n77/x/8A+NWvAD791HXc2cABwQAAb7+dRP9+f1v8o0Cnafv9TUGkiXxsJADcFlVafHBpnQptPKZlpZdE/W532y2J9jrkIuxIdeTkBLVkxdcEO3fLoXn4zMpSI4DuJ9UF98O68HPQ1rW0UqT7TRt7SG5Qt/TNXWesfaWGJLX0e8/FY3XX8tDz9fj8a70HeA60gcDLb0UT9uHOMzyS1XgURtQsmm/9VD77Wq/BFlm12AeYP9TmVpl6MrJiqMjig2ufPk0W/adPk7j/9a/T55NPklX/L/5imTqXQEWoR5AA5CPNoIDmywA30y03G08eaXtO5FsuSDnrPbeeSst0nYp3gNsRlLjg1xoDADdFfqmPfs7inzsfLS23zWOVl9b5/p6JubgI9lr7eR0sS79U39y55ES3VGcE6yCdx2iMWq/gFluP9jIq3lCcW78fqxL7Xh+toA1vrNtcGd/73yvbbqBZ9NEn//795K7z4EES+V9+mQT/r3+dLPn//b/LbgsjgoN1z88P39SX//Q0pdGIPSigAW5H7MEBwiigqdDmopv3HtBeAMv1JifgeV7Jeq81KhDJqqu5rVjWedyPi3rM++KFnoeXpVn6JZHf6sYjrZfOhKuhRVeSGo/8W7Lac9HNv6ml3ju4l9fDsq5LjQxJwNdY/3lPhWa1H0Vo19ZjlPoHizCiuLXGD9ZqEY7WQ8ANx/TYW2gMrErshwV/fuYItSmKIXR5efTo4K7z9GkaXHv//sEP/6OPAH7+c8cx58G8WicnAG+9dRise35+8/vsLA3QpdF7NJEN0C7ycxZ8PA4dKKyJf0ns8+MA3FwuiWKiuctwdx3JOs+t+NzSz2dCzol8nhdgGj/9ngM/WxsN1OoOYLu0UFHM97Gs/Dl6Wvb5OXh6PEZmjXU+UkLD1LHkdVu70AdYmdgHuN3aGok1jcAv9dGvKZ9z63iSJf/6+hAj/9GjFB7zyZMURefxY4DPP09RdX7849QIGI1/9s/S9+PHqc4Yux+t9SisUQShuL97N+WhYl+KsAMgf/O8vBeAfzTrvWWl59+0oaAdx5rRlmNZyCXhzsU7wG3hbjUCJBcdqzGRE/ceod8SclPCI/L4drS6A9y8P5YYp8tUwOfqgYJfamxwa7xWby0+fy/LPs9Hz90aQKyVmeux8JBrzHj2CYbDY0nmVuQSq7K1b85vfQTrtddy36JZNOs93b5FVif2RxP4nLU8KNZDPsU1vvXj5UKfzmj7/Hly0Xn8OLntPHqUBD667Pzf/6sK/aWv/sm/+3di+v5f/su0gGMQMIQoF8vcqs+t7gC6JZ6mcTGvudNojQqpwWC57dAPwKFMrG8OSehrAh/AdsXRGgJWHr6druNyq88+1pefM1+W1i1K8mqWd8vHnu5HBX8uao1m4af1LhGnluDuadnnZUp1L
0FqbFnH1QjBH3RmtImkpnDb4VgDfUe4BlMxvNgfXdxT1vqgaH5yuZa+9wd4qwxJ2GAozcvLJPJ/85sUI/+rr5Lg/8lPAD7+GOCXv3Sd03D843+czhHHIOCsvldXSbhSQSiJaYDbPtPcNUYS/dySr1n8qeinFnzcR3Lj0Roi9FsiJ4rpwGVN5Gt5aiz5mnDnlvwagS+Je0vkt4bcrBGl3ILOt0nWdGqx1yzuvAGQayBo35L/PO7Hj8PPSWsI8edTqz+tJ51ngKNNNFfyu+hFCP5hyVmVtX16HXuKcqfEirs/l0eCVJc1MrzYX5Of/gjdYD2wuhqRkvNU7x+16GO0HQyl+dlnSfD/5V+mwbh/9mfJwi+WPz4n//Sffr+8/4//MZ0rzvp7dZXODcU/gCwU6DK3nGsW9ly8e8n9xjuZF68nrytFE12SoJbEuSbiLUFfkkdal6z+0jL9rhH5PX31+czHtHxJlHtdVjQBn0PKp5VvfWv7a8fIWfY1q36JUO5lsS8hhPyq8YaPtCL31Eb1GdFtJ4ck7qfUgzWNsbUwvNgHGNtPn7OGHxCA7BunNaxapq0290dxi5NhPX6chP69eylO/uefpzCan356Q+iPf3VtTv7RP7qxvv/P/zmdM4YYpWISQH65U7EtWfxz1n7J318agAtwGNTLGxKS2Mm5o0iCWXKjkSz1AAeh7m0I0Lw5/35c9rjr1Ij8EoGvNY48UFcbgNthJLH8nKj2CkvNvaa3MOUhMQF0Kz/Cr6PmrqNtyyH9Bnqds3b9PNc1GgWrw3r3WiHGtcaAd76ckcS/pEWmFvd4DFxe+hpMxbBif3RRr7GWByU3RTVNl6z6mqU/a7mgQgz983H22/v3k6vOxx8D/Nf/mqz6f/VXbSe6Bn70I4B33z1cC+ztQPceLv4BblsRqaCnafyTG5iL23gZ9JiWtdRyn9As55o4r/HL1wboauXQvLROPL1E6JdY8nuKfFoGvUcen/MWYagNvp5C9PJz0US/dvzcdgttDMrJiSzyrd9N67UJMb85LIE+la4Y3WefMrXbjhVUZAsMJ/bXKvIBtvNg5Pz0m7m+vjkg9+nTZLl/8CBZ9D/6COB//s+0TtjG1b3NyY9+BAAA+88/T2L/2bN0TZ49S4KXuvdQJDEhWfMB8v72litQibuGR+QD+FxyvBZ/XrbWKyDVwyv0NSu9NWEWPQY9b2QKoe/Fc2+XFpNaL4HUeNFi+SMlFnKPkM5Z8D0N5JLfV3AUcMG/JjfmqZlD/HPmbnxNyXBif00uO5wRusG81Ax0kfIX3S+MQEP91Z88Sdb7L78E+N//G+BnPxOF/lHw4Yfpm45fwEHLKPilQZSSD73mimNZ70vFPf3m2zRLe0mce69LjlaWN5wmLY+nTynAMQQrojWWvEgWZ4pmmcZ1adIs/lxI7mO5fBxpMCtFE/lc8PNzpL8NTfh7XJUkCzzvMUOsSeMkSt1vrIZHWPc3BX9/WjO/lrjbWrpkdM0SsfX7MZzYp0w56jqoo+leoIhC4YpuKw8epKg7n30G8MknSfjDdi35Gifsxb1/8OAgHl68SB9umZZEvsdiL+3Hy6Rwa7gkTCWrOU3n4juXRyofwLbo4zdvFGl5sTwpfQksN6jcPpSc0OffufkQtGXaMNDqotWpBq+LkmTpl/b3gOXz8/RY//l+0nqOEPUB5Af0tqxj2tICdwStt/Q1mIqhxP4IN7qFNT0kOV89zXqvlaFNMX1DgKHIR8v1l18mkf+//lcKqflf/stx+Og7eXrxA3j9d99OFn68ZjRGPyIJ91wcfEmISELTWpYs5iWW+JI8WoNBs9oD2A0CvtwL2gjCZXpdqQB9+VIXwB5xp+3LRa50jzWRrz0jNN0KAauVIR2H182LJPgRy5+/BKkBzD+5RjXuJ5VnnU8J0RA4Gnq69PAwlpi2NLk6LOm6IzHCNfMyjNgPoT8OtUJfK+OWIEM//QcP0qDcL79MVv1XQn87V7KNu3dP4MmTP
bx+9+7hhY49IngtJUHmtejTbwnNgs/zaG4vOUu8lIeXQY8BIEe74cu0LK3OnJZwlwA3XXIkwY/L9Phc+PPyvGgx3qV1r/iWnhXcT3IHahHvOSRBO5fIpecLIF/XENtBMAujjWEYLZqRxTBif4nBFz0Z/UZztJHnpf74Uv7v07iFFgfk3r+fhP7Pf57cdv7sz1LkneAWv/kNwBtvnMOdO+fw+rt3DnMSYE8JFZI5yz7mo98AtlCXlnMfAN1aL5VBJ8iSjoPwibSk+tWCYp1a47VvDT4xmobUgOohGC13EctNxxL4tHFQ0jDQGhVSPSX4tfYKfu/gXV4Pfg45y73XVa6mMTBX4yYaKatiqhlkRxKrI+q+uSc/m4phxD5AWXzYoB3vmIiq+yEJNxSqGH3n/v3kxvPxx+RYAeX3fi9dkfv393Dx9jmcXsBB6APoYp+KfoCbA3IpkqDVkBoFVj4AOeJMqWWdNgh6kDtPns+6PjQNBSV1z8EGBCUn3LS65cQZ305Fr8ca7xX6JftZdZxLbOYaGp56cNGujVWY8pzCZSeYkBHE6ojjNEdoBPVgUbHfcwKnJVnjg2DF2ffs54mz/z0o8jG6zNdfJ5H/0UcAP/1p2h6YPHqU3vUXF+fw1t3dTXcexBD7ezhJWVl22J3e7InhSC46lpWeW+t5Hqlc7pozhS89Ylnosc7U3UYTV5qrjrTOy7d8yHnjwOPO47Fa82UrzePiY31LjQyrTtK6RI3F2+rJwGWeTiNY8d+TNo8FL7/Gol9yXkGwYSwdsoRG7DlWYimGsuyvla20/CQ8/vu38nIhSCPv4Cy59++nWWMfPny1b2Dx+79/sPC/hFM4PYObIpoLDSLyAW5raEvzqqKeftPtlkDXfPSRnhZ7iuRDn7Pon54erPLUpYeXAaAvS+uYVjNQNIfHUu0V+7UiXyqDlqPVsxTPuZYcXxP9fF++nYcmlb5LCXEfBCprMgJLE5OOwKJif80x9TmjC34trr7XbUq7V6KfPoKuOxhP/+HD9Pnss8PkWccYT78BHNt8fn4Kr11cyCr+1Qet+W6hL1ns6bLmny/tD6C75vS23FPRDXBTtEuuOB6oFb504C0X/jytF7VCH6DcxUdKy5WXq09pWi5Pi9vSbnc7yhAf98L99HPla2k9sXqeavYLjo4Sg95Uxx2dUgE/ohYczo1nrYx4cyncVUdqnJQOyDVBiz6NqX//fvr86lcAX3yRXHsuLlL8+NZoKEfC2dlBh56dncLu7BROrpl1nAl9Tlbzlgx+ndLlhoICHuC2uEZoOhf8HqQLw8Njlookra6lWMftZfGuEa5auVIdWoR+rizvAGTt2xqXkBP6kstOrQgPgoWYW8OM5qpjwQc1W0gNgxGMwbOK/dFuYCtL37wWcn76Jed2476ixRPFPsaHv7xMfvpffAHwf/5P8tu/vAT4rd8C+Af/AODf//um8zkWzs5uBuE5P38l+nc35zoo0euitT5n2Zf290Kt4qenh8HGuI1Xmubn7jmYV7K6a245JVZ+Wi6APEsrrw/Cz6EFa/+WePt83ZNHm7Art19NWq4872RgmkjHMqhrDh3ULgl9/qHHmlLMe8uOBkVQyGhaZk0NAY7WMGitd2vDYRHL/lYi7izV/TU16gRZBHVALvfTRzeehw8PYv+TT5Lv/gcfwMmnn4bQL+Cdd9J1v3dvfyOcfNIZN59BS4N/r3Ok36Hkq5+Di+KSfFTAS5Wkwlxyz+F5JHivgFQPqcEgLePxtGMgU/joW+XWWMStdat3gNejp9U+V69cL4X3W5ocjH8soU+Pa/WATCW+a8uNxkDAGMHyTNmCPvQghTvP9QrU3qdZxb7kShLMg+az7923CO7C8/RpEvz37iXB/1d/BSeXl2VlBjd4773DgF1Jj1i6+5YG0cS9Jfh7uO9wi71HbHN3Hm0wLLfi07CYAHJIzNq60zQ8fg+8PQ/e9JrGgOWm49m/Z31KXJEsi742a7BX6NNjWY2jF
rzXNbdfEDix3vM1AvNY9J0m0Pk6N57iuuVOLa3XMqnY1y7AVhipJZyjpGUotST5vTMtwmjVx5j6v/lNmiX3/v1k4f/hD7+fLTdo4/r6MORBcivm3Er3inupIEmMU3IWeG6xFyuoYOXnQl+rqzYI16onoqWVnIOHFsFvbctZ7QF84r53mrex0WLFx3Seh85FIQl9emxtfS5ru/f6th4nOEp6aJuWMtakFS1XHe+2GkoHDVeJfWvWNak7aE03LrB9+LP3kvp1X10lJfrdd0nkP3qUGgBvvRVivyN4qXHwbs6yD/BKz9B7KfnhS+k5NIt3rmGAFnvuziPtSwWLZOUvibrDywGot/r3suz3suJa273iXiqnt/Wep3nchbzWfE84UV6W1mrOrZecYxAMyghGzNCQebSZkDUizv6RIrn1SD+woh++5Kv/4EGy7H/1VVoHAPh7fw/gl7/scRpHz253GN96dXXQE95oiuqg3JIKaGKcDo6Vtktga8U6jrYfWui5RV6y7uN2mg/hx5HmArDOYS6xj9ROvJU7Xg8hW2K5B6iLFOSx4ue+UeQDpFHvNF1qGLQ2yLRzy20rSavJExw93lDclutKiPN5mcyyHyRQDI82uEWi1mff3U3EBSOK/sePkzX/8WOA62s4+clPAH7yk6pzCG7z7rvpvnz11e176vXCKCZnvacV0OLd5/ah20us9V6f/Brrv3QcidaL7BHvFO9A4NqegCks+d6QnSUiH8utmfmX7odpXtFt1aeUlt6aIJiJKd1TgmmoFvvHerMlcT+60Aewffa1+muTcPG07+FC//795K5z714S+++913oagcHz5weNcnGR9C6dFwjZ7V7dvxqLPhfeHjGei3evWe25tV6y1GsNAilcpma911yYrDSP6Kqx8E8pFEvyWts9gj7nJuS1Zltiv3bGX6txoFnxa8puWfas59JL8wSBwtQaZ8uacQ481y8s+xWsQdzPCvXTx8+LF4dBus+fp/XXX1+6ppsGNe319UHkZ/E491v5JIs9h8e7l/bLNRRygt86Px6Jh57XVLT2HIwu+nPWeqTUau9Zbp1ASypDEvR8uZYakR5CP9g4a/CI2BIh9gtZa2x9j4++lBfzZc+bW/WfPk0Dcr/9Ng3QvbwE+Ft/q+MZBRw6vhX9+FHT3NIxNb75lFLRTSvJrfWe4/IIOlIjJLcsnQOvG8DtybK0/FqahpV3CkHWS/TXTNolpZVa80sj8VjbpLKsBkEvS35Y9IMNMJXlPSz68xFi/0joOceBui8V/PjZ7wHOz+Hkxz8G+PGPq48Z2HzwQbq/n36a7g3eCmmuqu/xCG2vRd/7Lc1qm0NqSJSG6sydj4Q0SZYVLchz/DkpPZ5X1Gvll6xb20pmxc1tqwndaZWvUdIbMJdFPwiC4BUh9jOUxjJdC9a55M5TbCxcXR2E/rNnKfIOpr3xRo8qBw5QQ1OPKgxw873+8TT0uLW+1pJvoc1qqx3H65tPy+R5cut0GeOul1j0rYG7c1Myg2+JAPWk5URryQRZ2nKpi4+2f4nV3rOtZtmznkuvzRcEjUR0nvEJsZ8hHtIE/cGq8dmpunz+/CDKfvCDmWt73OAtADgM1G1GE+LWNm+joHRWW82aztOlSbOkfPxcAG5v96TRdKQ1DGeOVkFXKvKl9Fphz/PmRLm1rA3Wze0XQj8ImjnWgC1rIsR+AfyhXZOlX2tlWz0XPF1todOoLhhj//Iypd25A/D22z1PJTA4PU1joQGSvr26SoLfTY0lv9bKLzUcAOyY+dKyVo+c8KZWeMlFR9pHE+9eUd8i/nsKuFpXE0vIA+T9+L0inK57Gg9eUa1tn0vkl2zzpLfmDYLgKAixX8CaxD2nJvRmbtsNqN8I+uujsLlzp7i+QR0vX8qdLS481nptGx6sVmho1nrJOq9Z4nkDwLtO06Tj8npyPOetDRJuZUrLfq+Bubl1a1tLPH5P3XqL+xbrfQj9IAgmIsQ+Y6s++rPx7NkNs
X/yr//10jU6Kn7v907gl788WKhpG8zUAR6hb4n6XqJfqwsFLf8ANxsA0rG1unJ42tlZX199LcJPDaWTbkl4/Pm94rNV7AP4RT1fL9nmzV9jtW+x3lu/lRD6QRB0IMQ+g/ueHbP4pwNt3OduDYwLZsGjbc2dvUK/1X1HWs4dQ1qncNGtWeIlSsS91DjI7bMUvQSjRzxLaaXx9+dY52k9XHR6WfJz21ryBkFwlITYz7AV8S/52muz4dLzooNy93Ciz6Ardb3HS2gRNI+Y4h28Qr+nVT9XN8861oOjuebw3gFEO5fcOY4m9gH896RGgNaIeS29Ry9BTaOkp7CvrY+VrhH/sUEQOAixz/BYs3Min+9rTWhl5e1JrV++GoUHoU7hu12ycJ6fA9y9WzgyNOjF+XnlZLE14l4rI7etxHJf44YjpXut79Y26kIk0SX0UWdK3X6s8ysVqa1CtzatpNEyhStOyXnntvXIHwTBUXP0Yl8T3b0EtyaUS0JVzd2DUNXg4IL/9LQsznfQDRzLSnWyqg00tx26TVpGPOK8Ji+vnycNyyxJz23j20vcgkalh5Xf2j5lekuatq2nqA+RHwTBgBy92KdwC3ytdV7C2hfXtW1SHWrOjR9HqqPUODEnxeAvn7OzZNV/550Q+wtxcXFTuxcL/RaXHMutxmOxLxXTtdt65ynJNxe1wtCzX++GQO1+LUJ6SiEfAj9YACu0dnDchNjvRI0Qn/JHqJVde8zsnwaqSrTsv/561XGCdqjAN8W+tnNJqMratDkEjGbVl/JY+TxuRFI+L3NdD8oUgrOn8O5Vdq2FP7d/iPxgMOh8OFI6EuK/DE3XSQZhzD/iNT5KsS/dvNwPxOPiU+KaU+vWY/U+WPW3yFkCXA8xinwaU380S+eRgLcCIH2fnBiiP2fNLxGikrjOCX9K7nmZ0sJe+6yO/Iz3EIslZfRyDyrJt1QvRGvjpdc+QfCKEQXmFrCuq7Rt1PtwdGJ/TRF0NKZ6mLqUi0rz7OgerWGgmuHkpNCbSnPDAbB91z1+7TmR30vseCz62j4Uz/5bEWhTuvzU5u/deMjln7L3YYr9AgAof6ePKsaCYEo2r8h6DrS1/OyXYs7jm1Z9ak4u9h0JenJ2dojGc36evm/dEq+7jpecO4wl9FvcX6ZiZIt9T3r/TufuTZh7vxD0i9HrXS6Vs/R73MvUxsq1XIdejGj8neoebF7s92LNfxCTwt1AuPAPZsfjivz9Bo8VP1dQLpqNRo+ZdrU6tNJSr6kaCqMJxqnq07uHZ4k6jHavVsQSAkzyvQ62y2gif45nbvOKbKq49SNRM9EX99PnaUXQF9vZWYj9heEDdOm3OhlarWCX8nrF7lYE0dy9AFu5bl6WPN9ju9YTsLb3b/P7cEKmDsu9lcG8S4QrH53NKrK1/cG0YA0S8YTi6uarT5ePxQ1iQPitKN5J49hEvJdjO985iWu7Crb4vh1R9G/xOk/B3HMVTVl+L7fxKrE/gq86J34EidIoQL3YwwnA7jR9JjxOkOd0N9G1LxBea/g9Rns0T2jt4NhZytq9hv/QtbBmD49ek71WW/ZHE/ylbixBG9p1vr5GERX3YTEK/O2n+r1MIaSnFufHLP5ro6FOeew5iMZMMCojaJo1u/WEFrxJkxvPaIIfYN0tuF54ZuNtKVsCRcHVVVrGaDDBAhxaXKpPj+c30iL05gx137ov5xif3d6TXZeI6Ny9mzJgUO1zE42E42Vu3SONr1sKzxxDc7H0tVgbzT77I4xij5t+E2uCrl5l02uO2vL6GuDFC4D9/iD6gwW4urq5rk2YRZh7PitPvhLRva94zFufz5Gf71Ixal3rGmF7UvmXLDU6vBMgeymNMHvsUzAEt5na4j26phnByGvVYfTrtwRdB+jOPaAlbug8WJYMfOmh2Eeh//z5jBUMbqIpEUF9aFlbxX9uuyUuPcJ9qsl01ziRbq0Y7VFW7xD12E6tbSyU1MP7MymJQmvtF+J/u7SI/
zXpmBFEvocRXKBGY9XReOKGzoOnGxFdd54/Ty/sy8uxLZ+bhrvx7HYAFxfZuPb8ftU0BDQRrwn4mmPUbivJU5IPYBzXnxZ3nJ4T1vaYbNZTBu/EomgNBnqvSq9XaUOBUrNPsA5aRPCaLNRWfcKzY2wmEfsj+vIH04Evsf3+4MoTlv0FoX5VFxeHNKYqqPjQlrX8SIm4Lym3dyPAs90j2JdwF6LkhKElfimSENbOnwtij8Vam6uNb8+J45YJa3P36uREFv7W+XnPS9su0TJxdbBt1jQGcY7BvGu5FiMymWV/Sl/+uOHj8fJlEhrPngE8fQrw6NHSNTpicMAEqqXMJGfUFUtKR7gY9Ah6j5D3iv3WxoZVTuu2kjxTUyMarX1evLC3W9s067pmUdfK6pUubcN1zYXo5Ut/D4A1+Dd3X2rdhYLlaRm4unY9s9RgZY21X8+pmMWNp6cvf9zIMUFxhaL/8jJeVItBLfso+JmZVBP4UloPkd+6LtXDW5fa9JZtLXlz1IjG3P45YWpZrHOWaSn96krvXWiNDGTN6Zeb74/3CGAd8dmjdZPO27oWNRb8lt6NYHkka3domGmJaywzm89+r9bfUn764ZaU0KLx4DcK/QcPFqhckLi6Olj30bIPYKocLv6psOaC2hLmtdtajimta2mleXv3HMxFqUC0BsSWWOF7pOE1l/Lwekp1sxodUqOGinBJkNNlrZHiOQ7SYvEP4T82Hp2wtcG7njqGoXd5ZhP7Pf3446aPB74AUfA/fz6G6DlKqFUf1zPZ6bcmunuIfLqc6zEoaSjUiv8eYw5y20ryeKiJBFOyP99uubcA+KzdUm+AR+RqLje8Dsjpqe4OUyL0rQYAPhvU6t9q8Q///mCr9DT0hvarZ9ZoPNKNyj0II93cGHgsQw3H6Lf/5En6BAtAW1pOhcmFvibyW5ZzjYiaBkPO5YgfK3ec0jQr3bu9B7399Etccvg153ly1vicoC9Z5/eau+HQOnBBL01HoS3z4+YaHVLnWsnAX06I/jEZSa/MzRzayHuMY74PGqsOvTk3IfR16EsWl8OyvxCNFz4n9EtEuKeXoEXsW70Dvaz/Pdx+piZ3XM9kVR7x6bFaa0JUGwjLewi4qG7xW9dcgXKTiHkt/iXWfm/vRqmlPwT/uJRqhjWL1KWNoWu+dnOwqNgv9W+Lm7k8uXtAX6Ih9heCKgBDCVBPH4DbAoiL8BLhz3sIahoNJb0C1nLN+lSDgefEsjxr+TiSf3qLZb7GKs+PycuzXG4sCzgfI4BY1nwLOrhXCutZ4t5TYrkPK/8Y9BC6Whlr0T5rqecxsqjY94TnjIdnLHJ+c+fnKcojjgv9t/92D7/92wD//J/HfZyD/eef3w6FVKACuDj3CH66XCvyPWV5jm8t8/VeLj9Wund7KT399i0Lv5TusczT/Fw8e7fxY0pW8xZhzo9Hz0Oy4EvH0hoalqVfE/2tEXzCyr8sU1i216x/prbyr/naLMEwbjxx49aDJPhPTtKL5uwsfS4uDi+e8N2fEWxl4QdbXriNKAJc1QRtieDXRL4l7Pm3ZL1vaSwgNQOOW/J4t/Wgp3tLbrtnsCxNs3z1tbZozlKPy7TBIYl/q1EhrdO8kujneF19JEu/1Sji52rVVyIE/7J4Z5ilDYOtaJ+5XXjWNPPwCAwj9oN1wl/CKPTPz1Pas2cAX3+9TN2OEnzbS5/CYjzflvW9toyShgJfLnX3KdkmrUvHtfDOwJsL71h63NwsuIg1kDTXA6CJc1znAh3r5RHNmgU/Z/mvQRP9tE78nLWeCurT3zpGIQT/+HgFL823FdE6xwy6uWMGOiH2HcTAXB+7XXqBnZ4CvP56Snv5EuDbb5et11HCrfoZXr7UXVhKRTrPWyvyPaK/1tWnZBs9DjJanP3cbUahzdHCaQLctkDz41iWaY/VOtcAyLnMcKFNy881giThzstGgY6NB61O3oYJd+2hSIKfXy8pn0YI/vmYUh/wstckbtdU1
2MgxL6DpUeZr4Hz8/QyOz8HeO219Lm4SNuePl22bkcFvuXpwAnhrc9FSW5gdatQp6K/ZN+angRt2RL0JTH/pXXvtlY0Aec9Zon7Dm8gSINO+bG5iPa46PB8kijGY0kNCcVLzeXuk0NrPNT68lNyFv6SNIkQ/PPgGXvYo+y1MbdmWvO1moMQ+0E30G9/t0tiH/Xms2cA/+Sf7OGP/gjg3/yb+EFOwf6rr24PjuBqyIEkamuEvsea72kg4HfPgb+0HrRMKZ/V8PGm90QTjnPAj+mZTZaiWbw9x8T7VeOiQ33nNX98y2Jv1dMS/HwZyQ3eDcG/XnoYBtcqWpc2iK65F2QOQuw7WPohXgPownN2loT+nTsAd+8CvP02wBtvJOv+l18uXcuNc/bq54yWfc2N5/oaYHdTMZ2e3rbkSuK1VOhbDYTevQDS99QhO3PpvdH853P5Ec11RdvucS2heekzZIWf5OXkli3LP7X6e795QwVFsTYOodbCT7GuX020nhDyyzOFNlirD3+ubnRAcu112+LA5rkIsR80Q1+iKPovLpLgv3PnEI7zu++Wrefm4TfCcOM5gT3sdidu0UjzlVj0KaW+/XQcQQ+R7xH4tWJ/jsm0rOgwHE+jwMqjbZN83aW81JoOkBf+NXj88yW/emn/XNjQFku6ti8dtMvr0Vp2aZ5gHeRCX48MrTeKdkzjAt46z7We/9IML/ZHaMWFz74P1JUYkeeddwA+/DBF43n6FODjj5eu4cZBSz711weQHYnhsIjJ6IblEbSayMZlr9uOR+R7GhUlUYG8y/ycefnaPlNREslHE8B8VlfEa4XWrM6eNLqu9QBY0Xl4GfybWvz5gFrJr59a89HSz91ptPEAfNBuTohry5bg7+mqE4J/GubQBmsSt6XXguf3RinysKbrNgdDin0xjnvMpLsKqN/+xUVy5XnjjWThv74G+Pt/fw9/+28D/PEfxz3swf7BA4Dnz9MKF/rOt7sl8D1IIvxGHRUBLuWVrPl8P57mdfXxlEnL43WX8ubSe+G5R1z0UnjYR0QaZEvL0hoEUrqVZuXRZp2l+XgDQLse2KPg6WnhZex2h/08DRdN8PcU1rysEO3BqIxqEF1zb0hPhhD7UneO9eDMZe0f9eEdEXwBYVSeiwuAN99ML6f33kt69OOPAR49Avhv/23Zum6Kq6ub4p5b9QFUtXuy2wG8+g1xoaMJS2/segDbQi8tY37Lgi/l59utb7rcGpPfk94bS+hZvvRaGrewIzxSDECb0PS6oNB1tLZrVn/8lkS21JDAMqS8mpDX2sy7nW9wsnbNvNZ9qYxw51meObXBWnz4S+uSc+Ph67XnOtI1WpIhxD5A3c0cwcXnWNGuPb4E6Uy6aN2/ezetP3y4RI03jCX0cbvypreEvceaLG2/vtYtq5qQ5nH+NQu8JNS9Qr9W5HsE/1xifyrBJvnYA+Rnfe2dxte1yags6752fpZ7E83HxTetF6+b5mdfK8hbCDF/XKxN90gDdLnukxo2PG+uYRDIDCP2Aepby1OFXOIP07GT80+k1v2XLw+uOx98kHToBx8ky/69ezNV+BhAfynNdYer1lu+C6ff73J62jbQVHPN4el8mQt3Lu7puhbVxyvwa+Lz82Vp3XPdes+eqw3mtKzluJ4T2Xw5F2ZTcomh5Wpp2r6aWxGAHYZTc6mRLOmSfzz9YD608EvnTOvT4sLTw7pvpXu3Bz7m1AZrEbE1swfn9ivx6c8d/9gj+Swu9qcKXdXrZobQP1ByLdDQfHZ2Mwznu++mz5Mne3j3XYCPPz6+H10P9vfu3VYnVOw7TZ7pnt7undGKODnxi9bcLLO0fOq+Ix2b55XK4Hk1oW+JfK/Yn3NGXU2c0QmjKLVCMVcHybfeimLD6+4VqLl9LR9/S+jzb76/1sh4+TJfjnV+UxHCfTmWcuMZWaRqEXaWrg8uHzuLiP05bkDtMZZ+QNcA74bDNICDyAdIL6Orq+S7DwDwu7+btv/BH
wB89VVE52kCLfoknv6tnhdNCTOFwP2cUURS4UZ9+kuwBLNnX83Vx7Loe0W+9s2XSwbsaufYw82nxCef15PPRssFNb/XfFk6nhVOUxP9kqVfymft6xH9HE2o73Y3ey20833x4ub50bpwdx6rV6GF0co5VpbUCKNPHMXDa/ai1oDL9/HMBeAZR8AbYNr6SPdnccv+1KylVbwmrNY7fQlivH38vPtuemn+6Edp/fPPAXa7Pdy9C3DvXtwbD/t79w6++a/cd/avJshKL/FX90VSTQCutzwPwYnL3M2nsBPBhWSt95YvCXTPJF5SWq/Bu72vDcXjFiO58FBxyyPKYHkeMUhFPy2L1skqszaNr3O/fimGv2aZx225CD747GMDQRLyUiSfJQgxfxyMrmd6TKIllefFI8Jz8fw1/ZhrRIx4bxYV+3O3kD2tLSvkZ+Bnt7sZmefOneSzf+cOwIMHKe1nP0sW/idPlq7tiri6Shfx7Az2Z+ep94RE3sQX/ekObqsRDtumiRYqbKjo8YrPHB63IM2vn69fX5fP1ku/p5qMS0trQbNCa207qRFAxb81qZP1GEmWfilWfIsIzbn6cCu/ZGXPkeu9ssqh7jxSfXPptdSWFw2Cepaad2dEAem5DktpqJy//7FN4LV5y75ECPj+4A+GCwr02b++Prjz/LW/ll6qv//7Seg/erRcvVfHqxBH+7NzuLxMSVRgHV7g6V6cnJ3ZCkWAWvAlS6ZWDN+m9RBwSsYBePHE6teEfmmcf2uZlqvVsRQrZj4X+tK11yzTkvAvEYWa6LcEv9e6X0ru+NrznBP81hgB2jugHU/DOyjbS4j56RhFvAIsL0at4/e27LeSc6/Rov5I+4Ybj8JoNz0oR3Pboelc8KPYv7hI63fvpn3eey+Jit/+bYAvv0Sxj+WO8+MYif2nn96w6D9/nq4hFXB0FlAcN7Hbndx26RFUgObeoIkGtO5jkbwszDM1lsi20r1C3+P6w5fnnnUXrzcfsKvNxioJXkn4e6zc0rcWurKHhT+HdGw8plRvfj64LIl6LJ/2bFlivuU8tR6WYFlG0y8jikqkp/++FKKzxLfe46vvXW/Jy5nzvh2lZT8oR4uHK7k98TSMDPniRVp+9Ci9CP/m30wC5fPPp6//6nkVzmh/dg6PHyfBgRPnImiFpwMGU9oJ7DDE5k5WEVx8np4e5uvSBA0V/FJ51OWHHkNyx+ADJKX6eGdX1c4J01qFvnQMj4+/Vicr3UK6Vvx+SLHzrVlf+T2mjUheX0vI8gafZeGXjqmVl0vzRibSehg0wY+/BRwuIz0LfL+wrG+PUQyWI4r7qa9Licjm6y1hOnPbvGMD6PoShNgPitB+BDxdcud5/fW0/uabSai+9x7Ad98l0X+IzHM7FOQxs//00yT079yBl7tzePo4NZqurm7OfgpwuM4oTPb7tH59ffjOCRAqXtC9xuPGo1mOeT5+DC6OPPvUkNu/VOhLlvzWWXh79IRY97i2x4W7qNCyco2FHkwlmq2647LklgOg95pYx2ml93UIV5/1MZLIH6HRMweSV4Mm8DGvNbh3KSYT+5KF91gejq3gEfbSA07deZDdLln1AQDeeSeJz7/zdwB++MMkXt95B+CnP534hNbIO+/A/s234PFjgKf3k5B//vymhZq6Xux2SaSfnx/W6YdadylcaGtuDBzMZw0N8PQAABwaKXw7kusp4A0Vy+rfC4+fP1+WxHaN/z5t7GkTMmkWdGqxtyLy4Dd+sHzac0SR3GOkc25pGHgs+kjOHUYS/NIxNAt+rgGr1ddCqnOI8jFYWsNY1ua5qQlr3uKKI61PRS+3nZK6Tn0Pw7IfuNFGrWsNOxp7//o6vZDRh//u3RSK8+oK4MMPAR4+BPjkE4D79wHCfz+xf/gQ4O7dJPSfAlxeHiz6kqCiogx9lyXRD3AzeglAuQDl7jx8m+TyQ8uTLKl0oK7maiHVwevyw4/tRbPqSxZ9qa6W0Neusad+9Npwd
x3uPoP5pWvIBT/Na7lgSem9rcVSY2IONHcefs0tdyStPE7vwbnB8dFzItGpsMJT5rZJ5zdKw6AXdGDwFPdyUrG/pgsd3Ea6f/ggegbsAtwUomjFvXs3if4PP0zL19fJsn/nDsBf/EVMtoV8B2/Bo1ehSZ8/v23RRyTLPlpdcaAufqPV8Pz85j60HA1JNFLLLbVIapZ+OrhRy4MW61wZWCdJgPJeAL6di1grIlDOjYmWa6WVTM7lQXOpomXnxjngOuaVYtRL+9P8tC5ag0IbOEvPJSfs+XqNNb+msWCFcMXrm/P5n6qREhb/eRlBQC4p6keaUKx0fQQ8/v1T3d9JxP6IFzmohz+gOUs+7sOhohJDcr7xRhL6H3wA8Du/k+Lu37uXLP1w6zka23LRyv6Vyf7l7hyePk3jGR49ShZ9Hn2HwgU7dTHg4gRFPwqYQ9SectEv4XWF0Nw4uIWf5qdgT4F2PCriLZHO3Uw8Vn/acOWCThK6vdyHLOswzcOtz9YEWnQfOrBbs1JLjTy6rt3TVlHqEfq9RT7ATd99aaAvXfcex7oe4cIT5FiDFT+QWVIbdxP7IfDXj2att7ZJoTc1UFDeuXNTJOz3AG+9lUT/D36Q/Pj/5E+Ob7KtX987hSdPTuHyEr7/UIs+gG0B5e461MKPH2rh59u4qw/uz49DwXqh8EVxLYFlX1+nvNRyv9sdfPZpHlzWBDONGOQZM4DHofXXhKt0rpJ4R+GZG8xMGwe8QVPSyMjRKq6tmXC9x9XqoE3cZZVvCX1N4HvqW3ONJDcnzXdfqlvpvSm5LsH0jKJzlvLdX6qR4fHvXyO5MJ093Xq6iP01X+ylGemBnboO1H0D4OC/f+dOEmDvvJNi73/7bRq8+4tfcMG/TV/+L77Yw6NHAF9/nXzznz1Ln6urNHiZugoAyGKfCntMox/qviOJewwpSMX+2dlNISOJLI8PuiYAuZuNlM8a3MvrZDVGqIXfsjzTBotmpZd6GSQk4Z8T/LTOWpk51iwCc3XPCf25ro/Hss9j8mvPXc7FKQhybM3SL/nha+G/t0rvexoDdDughVkqaZF5ZqJbCi3iDq7Tb74f34YvPIzMQ0NDnp+n7/feS4N3/+7fBfjJT9LA3QcPpjm3EfjoozQw+fnzZM23rNQAtwUytcBrlnpu5afing/ktaz/eAyvP7ImdCjokkOfBYCbQplb+fFzfq6LcvzQMvB8uIWWl4HH81rTcR96PTxWXKmxZO2jjdeQ0ISxto8UqalWGGv7eSeLos8aYgl9qzfJe6wcknD3iH76DJY8Gz3qHPRlaZ/9uUT90poDWYMffk+k56vXPW8S+1u/8F60gazSes2NW/oPBkAW/Dly+VBY3rmT1q+vUwz+q6vkv39+nnzWf/hDgP/0n26WzGu3Jv70T/ff++A/epQmFXv48KYVH5Gs6gC2FR8FPMBhIC633NOZdne7dOyzs9uDe60GA61fqfiQrOMeYcwbOlK4Tqt3wXsszCfVFbFcVeh+mshDv3+6n2Thlaz/Eh4/du2+caEv9bjgcqmwps+jtZ/1TJU0XFoE/kgieg11DOZlKiHIibDpy2BpydZ7XSX248bbeKZLXsMgG48PP0caqKs9rNLgUACA115L295/P1n5HzxIaR99lD5r52c/S+5JL16kz/X1zdlwJaENcFuQ8euniXL60dx5qNDHY+M3T9eOhXWh3xKatT9needWfqtsqSeAXi/Mw0N90mPRevDQofw8pG96Tvw4fFla51jhRL2+65aQ5r1DPD/fp7ZhUNpAtOaGkJa1PBae68SxxntYg9TnsOhHg2AaRtM+c/nuj6ZVNB/+0e5PC7lejdLzDTeeickNbG0tZ05qGyjW7Lq4fnFxeBFeXBxEElqc//APU8Se62spNCe9JmP9KSH/6l/t4eXLdD6/+EXyyeex0VFQU7cayZouiXlN4NPxETT9xYubefg+Jem8HgC3J1ySxAd/BhBNzHMxTQf58sG+luiX8
kiiny7Tukl18Qp/vDb0ePRa8HV+/iVYQluylFsCX2p0Wnm0xoTWKNS+pR4BTZjn0rx5NKHvdUGSaBX5JYTQn45RBeUUYlwS1CNxzC4+1rwEGiH2O6DdhCmmS57zz8aKs6+ta/BrxC2c6M5zfp6+r6+T6N/v0/of/VGyhv/O7yQf/v/wH25aw+mRtLOZg3/xL/aw36e6fvXVwQ//T//0tmUZIJ3j2Vmab+D0NJ0rFf3czx7g8E0bCJjOhbc1k65mrc+JfLpdi+qDIttyD9HAQayWRZxG4fEKei7epeg/NKJOrncB68cbr4gk/AHkxgNeH94I4Pm9SNfZEvh0WRPvuR4Aq2Egud54hb7mtuMR/Va6N1/r4FkrbGdJI6ClARP0YynB31vQe85h6wJ6ZKSGVsvEYSH2O+B12wEYrzvMg1Vna1Y7aRu38APcdJtAAfn224cQnXfvpvTLyyQ0f/CDlP7FFwB/+ZcpLv8o/PEfy6Lu4uIwczA2bFDkn52lRsHFRTrH8/N0DVCo88G0KK65S47UI0DFpBVqU2oY8EaE1giQ6kjHCgDc3A/AHmxJ3VW0UJ7WIF5MlwbpAhxCb/IGAvYOcDFPhb+UF+tMGyc0PyKl8XPThL3lwiORi1STW7YEO88jiXVrH+041rNRIvJbRTHdVjpo2RLvLcI+hP44LCWAreP2CAoylSW/RvNs1TWnBMt/n862q+XlhNifmDX77E+BFDeWWz53u4OVHy3jV1dp+fQ0DeK9cyf58v/hHwL89KcAf/7n+SPPhSTYMF7+o0dJKL755sF1CSMTPX+ezuvFi3TuKF5RSEviHgUmFfco7F++vDlJErW2S2KcT7SFx0ULOG8wSGIfv/mkXtr4AoBUvmU9xRCVOTHDRT8X89x3X+oR4NZ4Po6A5peWsVzJ6s+fc8naT8/RahSUoolHTegD+AS7t3FgiX2pftJxrWWtDCs9ly/XK9VD1PcihP5xYxkca8T/0tqkpC7H0hhodVsKsT8BkltPywNp+WnN8aBLrezcDzBn4edIome3S7H3r66SCL68TML4rbfS92/9Vvr88IcAf/AHafDu55+nz8hcXR0ENFqZ0VJMxSENR0mFNIr78/Obs55i+vW1PHMqFeo0LxXGALKPuWTR55F78Dykdb4/gGwZtsQcheejdafPkmbt5+vUb583Dug10fLyPNI3vaYAN8ds8Aai5bqTc+vxXje67vXh175LLPnSd4k1P/eMlJy/hOdaWOWvRehHA6GOEazNNZZ8nn/pcwjKaHHhAQixPxtaLH6k1hd+FHrUV7J4ok87Lj97dli/cydF6XnttST4P/wwhbD88z8H+PTTsdx7NCRhSEUqFYd81mHaMKLWbClGPC9LSwOQw1lKFm5uwZe+cT8q+nGbNPi41CrKreXadsnaz/eXLPn8vvAy6aBTj+in94RfWwDZZ5+WIW3zoFmtEa8Pv/RdYsn37J/Lr51Tq2U/d020RumSjFKPY2MJwS+9X0uFvrQcbIPcsxBifwK26MNvTZ5Vky5xAnvY7Q75qYhDt56zsyRGLy8BvvsO4Jtvkn//u+8C/PW/nhoDOAvvZ58lV5+f/zwNlq0Z6NgT6qv/5ps3ffYxHRs3+NntZPcZLI8LZu63D3B7mX5TtEGkmB/dbaQJvOhgV6wbt+jT+mvbtPJpPSwkiz9vTEj+9Zqwz1nyNWGPjSjuwy/l5cvSunebhMf6nbOceyzwuW+pHM/gW+v+ewU/r7tGSYODPlc8+lRPWhsxvfc9dkb02efkjItTMKeWOcYeipwbT+46hNgPutLLwi9ZMtGijWIR4BCaE4UzRuh59900kPfRo+T28803h8G86DozF7tdEvMXFwdRT793u4PIv7g4COJcGM7crLh0m2S5pcKab+PQhgAdByDl4Q0DLui5WxL/UBckLtb5ddWut+YnT6E9JZiP++LTMjT/fayn1LtBrxfATT9+Wicu/Hl9e/nv566jJcwBynoApLTc5Fi9R
b43ko4nhj/95r+juQmxHljMJYCXMlauxUg6EiH2J6DVZ3/ElqrmK1YT71WDXqvd7mZ4ToC0jINaT08P0W1w8O533yXL/ttvJ8v/w4cp/W/8jZT+D/9hCon54AHA/fvJ2v/gQWoQPHp0KMcaOEm/+aBUTOfWbWqtRzFPI+7Qbz65leTqwn3ec40AgIOYyc2AmxN7tCx6X/g6bRhgnfjx+ceaDEwqQ6s3h58Tv6fSwFqanvPL16z4lu++ZdWXXKukWYG1/Tkl1u4SS7tHmOcmxLJEe4nY58fjeGLka8ew6rSE0G8lGgptLBl6s9QjQDK+9az7iFplq4TP/oBwN54ttkLxQWs5P8++lmUW4CCecTtGmUGL+YsXKc+LF2mw79OnaVbeb75J399+C/D48aFhcHlpD9Ck9eCDUOl2LvipoJcmz9JcdjyTamlCmtaRp1ti2QqLyfNK6xzqo6+FkLy+Prho0UHJ0oe7E0l4BI2WR5v8SvO5p2nUis8bEx6xb1n8rXqWUHpPS0Vwznpfezxt3RLcNc+IJe6lc5can9461BBi/TixQjFKA3Z5mMbguAmxPwM5H/7RW8eaRaH1D0SLKMDLpS9S7s5wfp7SMGLP1VVy4bm+TgIe/fufP09WfVxHKz4NiXl5edj2+HESeldXB59tOngW4La1lYpwXndJkFtCHvel594SCrE0jKGWVgu39nNRTO8t9f+3egToLMC5a2NhNXw0Vxy6rdSCr0Xk4fmldS2tFc/99zYIW4S7J48k7lsafrk6SM9HbrIw/lx66uGpW8tvMhoJbYz0nrb8tefUGKM0Injc+S0SoTc3hKcFvsTDPFJPhfbCQqswuvlcXwO8/noSVnQbin1sIKDYR3cgFPv4jWL/5Utd7EtWf49vsiXGLQHK00rDHlrLU6NZYS0Bu9vdFMjSoFtc5z0HtJHhPU/a8JD2ob7+NL9kyed15GXS8+YzrUpzC1i9XFKZnBrhm3PjKhX0NQJfqodUF2v/0m3Wb47vU3Nd52Tp428BHi0vsK/DXDqhZGKpYybE/syUtD6XFtYakpW/pa6l+/EXLwp3nJwKBflbbx3Wr6+TqKd++Sjm8YN5aF6ezsW+5c9N65g7FystZ9ksjWjirRvisS73sEBLg16pkKb+/1rEHir6c40hvkyR7oHUAOBinR5HisSTWy+5jlNY+BHv81LyXJWkTy3sSwW/91nSrPo8j3Ss3u4+0jGC9WCN/dN6xUeKUrP08bdGzmc/Qm8OxlZ/AHM3SvgLDP28pe1UqOFAWbTWo5hHC/6dO4f16+vkKoLuPJgOcBD9NN69JPZ7nV9uW6tgkvAMFPUOOvVeE48Qx4nENLj1G5etfXLH5Nssyz+iiX9ejtZ4kLbn0qV8Fp4yeoj33LZerjm9hL/0/HjmE+B4QnJONcg3RP66sVxzpN72tfjo19RtpEbMUkTozZWzxge3x+DcnkiW/uvrNNkWXb+4uO1/zwU8LtNvatHn++J+9DgAul//lNZYiZIXfs6qXLLsPX/P9ZDEMYbnxOPwXgDuNkOXcYAv1pOKN2+vjCUQtYaAdq60EcIbBhzres3ly1+yHaA8vr33GFNa9Om6FnpUe748rj61bkYeQuRPw8jv6lZ/7lam1gElYwVHvk9LEmJ/MEoG3qxhco+l0NwwqKBD8U57Bajw11x2JHGvidylBlx68biI8DySkNd6N6RytPIoktit7amQzpEKOen4JcJfKztXppTfeha4Tz9F8u+fglIrdElPUkvDolfvA00vmUBM2jd37qX3q6UXJujH1geC1uC9Hr0aBWHpLyfEftDEiDMBe17kkvsEDubFde3bK/BL3FqmttjmXFgkcZ2z3lvn6Y09r6VJ6xbSvaUWdsnyT/ejop9OGJaz1lv1sLZr52b1BmjuPCV4XJl6+Y7XCs+lRX/LLMHcV1+y6E8lyEPoT8+c7znLR9vy4Zf2HYWI8V9PxNkPFmckoS+h+Vpz0Sfl81ixteWceO09e
VLNPp6eBul8aqz7uUYSP4bVWCrBI4LorL3Ugl4r+nvUSctX2uNQ20ORY0px2atR5dnHM+BdsuBLPvvSxG88r0RNaFxKCP15mNN1taWXfyQhPPX1Gr2B04sIvRksyuhCn2O5Z2jWYABZqGv+1T0s+h5x6xXAljW3xqJPl1us+rlGAJ3Uih/LStPQfLIlLJcZPG4P1545yPUmzE3NtagVuznLPcUzy6+UZpWTq3ePwbkh9Odly4KyB3NrAu7Pfyziv5QQ+0ET/MekhQQbCe3lyAUcFUl8Hx79B/NJlFjwe4r8HlZ9niZNCOVx8ZnL1cfjpqLR6l9eYk0uzWPl9z4PaxaFtWLfO87Dagh6RL83Uo/k7uOpj8aa7+laCRFpU3JtvBpB8tHP+ey33KMt3uMQ+0F3RgsB5h1XoA2U1FwpNIHAxZfHssrDNEpIoRslPK4mUv09ado14hNNeRoAUl60qHvHP/DlWiEsUWItLhH6vQXanIJvjt6B1gZSrcDnaZa45yKfHsPr8hMEJeR89tcoTi1jIa5reqLn+eau34g9BuGzHyzOCAKfUlOfUqGs7ev1l64RACXCi1u8ufgp6WnwuClpeXpG8sn1Mnjq35Macb824VdS39Lr7C3bcn2pEfVaWoklX8pTKu5Ln5+1PTtbYYQoeCP76FN6TLTJy/CG3iwZ0FoyBmKUax0++0GQobXHITdoMpdXyl8jonq/7CUXGO43z+HWeWmsA1+WLPlSWTmxLzWqeANAytPT+i/RU9xPNeESx7rHtfR4PnPnXzKrbk2atKzNjFvybbnwBEHQxmgGxxEIsR90JdcVNieaqJ+iHqVivUUIlTQ2rH34vt6GCeKZDKq0F0BLl6z4Wl5vRB+vv3+rEK4R7GsSgS3jJCQ8517aoCq15PN1yV2H5vH49Xt6Baxlq67BfIzixrEGSq+T5ZdfEm40Z9DrdZw1EWI/CGamVFS3lOkpO1cfrXEhiTIu/LhwkepR2jtAjyP1CmhuVJbYt/aj1AjbNQn3GqY+v9yz3TK2wpPHE3dfWw5RHgQ2mnjWxPrUYnuUsYa9CbEfTMqSs9yt7QfbUxi0uAnRfTzuSda4BGl/LtSt3gC+XrtN6xkoKcNKOyamPv/WAbotLjyemPue5Zz7j6dcjWhAzIck/NZo1Z2Tmvduzi/fWpdCb2IaX7fQ/PLXMj4iR4j9YHKkH8vahPja8Ljt1JbjaUhY/vHcEmz1BuC6FsHHsv57rP18vgSeT6p/jRvVKPSo51xis0X41op7AD3efuuyVL6VN0T98sR7qpyRXHlrWaurjkWI/WBSvK1lADm0WNCfUhHRo5eA71vbG4BIA4c9VvjangGtHh5//h4Cu1djYsreo1ZK6lZq2feG45T29wpybVkbs1EzkDsaAMuzRSE4FzU+/FpEHc23HtO890nrBeDra/fhD7EfDAH/gQXjUBO9Jmf9LukNsMgNEuauSDU9A7R+Hr9+XjePT3uu0dCrp6YnlptXbXk98npnyLXKKRm8azUGNNcdbz2C8Yj3Uz88Rj1P6E0tj0dPaJ4GtWE6RyXEfjArVsuc5pmS6D1oo3YwcE6w5iydnsG9HOqmg3B3HWl/rTxvPgB5lmVPmRra4OAejYlaPCK15nnQqA3FqR2vxLpe4nKTq2e474zP2iy3IzHnu7U0Dr/GHJN3LUmI/WBWLLceb5ed1F0nrYeoX4aaaEOefXoPMrZ6BbSeh5JeDk+I1VyekvPvaWXvGU6zR7Qebxk17i89o/m0jAHIEY2AeRlx1tS14L1WU7ybrTGB0n3MDeZdMshIT0LsB8PgnbWONwy8/nnSdmu67mgk9KVnI4DvV9sQkI6fs6BzNyGpPlKdrJ6JljJq8uYocVeaA+/9zeXr1RhoFfnecj31C6ZDcxlZu/Abhdx19Pjolxr1vLPzSts0Y+IanocQ+8Gw5Fx+uOBHvD51tb58QV+mcAui+5UKpZJoO1LvAMfqLeDlt7g6lZSl7SNRKzRbo
z6VUCvyW7Z7B/32aqgE86KJyDUIu5EpNaiVhuUsocQzQNIa0vqohNgPhmGEFrLVMAjhvwy1oS69PQmaRbyn+Mq5xXjDTI5gZffQMkt0j2hPU+UpGfyrldM6diCYn6XfS2vGcqmZ43hSY83rCozrWyDEfjAMXjceaX2OOnnDhfZsGEiWhhzH4IrkFUAe1xmev8SnOjdAWCundlBurvdAOpZESzjVloZWb3r02mhYg2xLxH1uWwj9MZEma+LLQZ6Wa1XzLsvlrXXjyQUWGZ0Q+0HQQOs8Ap4/jNI/lVw9eJ22TGmvgEeQlTYgpH28YxFyA1OxMZCL/gJQ3iDQ6rQGAdpL1FNaQme2jCkJgmNGG2s3t1GrZNDviITYD4KZKBHhc1FSj600DEZy75EoGRxb6+LEybkZeaPhTBXi04NXtEuURgzqIe7Doj824bPfn5p3SC6u/lQ+/N77vJbnYfVify2tqiBYO3PPhzA31CVH86uu9ZmviURk7WftW7OPV+zmGgUtgnsOWsKA9hoPkMsXQj8IxmILOnP1Yp9SOq3xFm5gECxFD1/MEektxKaIdFOLd4xBbn/KWgYNU3pc997jBULkj8Xa3TZGpNc7YyrXnZIeg7WN4diU2C9lDTcoCLaINqZhi0wlkFvKrRnE21JWaZm1zOliNUX+IAgOaC41I70r1qIjVyf2S1tRa7kRQXDseH6rc//JTzULc4sInLKnYMqBuC3hOHsyVR2mDhkaBGtDm5W2ppy52Zp2XJ3Yp3gnZNBu2lq6X4IgSIxs4ZmLKV1paic461H2GlnCHSgI1oI2EZXEaP/lWxuYvWqxj0i++Hw7z6vlWZsfVhAcM1NGE+LRk0Z7GVFaQmlOeZyex56LUecICJZHiwwTWqGd0RsE1rHXcP+HF/vSD6nGlcfzkIwYGjEIgj5YPXze/UcW/CX09Nmf4thb4xjP+VgIrdCfNfzPru2+Dy/2AfqKe62ctd24YycXWSmsLYEX6f8inp16gTq65X5KQtQHx4w0ceRWWPs7YRViH5liCmTu1hNuPOuA35vcehD0YEvW/anYWiMhBHwAYLsDh25I9PaOGOm/VqrLmu71kGLf84Pp+RBojQHeSl3TjQ2CYBrm/G86JkJUB2sipxusuPBaQ4GvH7vmGD1EM7+nIzOE2B+p60f7oY1+I4+NEjeeUpefln2tOgbj0tNi3zo2IAiCdSHpF/xP0SLSRO/0gTX+N67t/gxtS7Hcb4IycrPBjUDO/Urb7jk3njf3XbKvVUfv+azh/gRtnHx/1w+fmn2DIFgHx/Z71d5xt//5bn7WxFr/hxex7M/tplOCJcDWeIORFqvBXINfrTrWbmvJK+3LLTW93cmk6+zp7uXr3q7h6H04sMR1qDlm6T70GZjifzXcDtbN2sTWiMxlrBn996W9a1vrPdIzatVl5PszhBvPWtDE1zHT4uayRkoaErW0/LF59t3aPQls5nJHjGdqnfD/62BctmSALGUUV2+Lkd+tJ/t9WcVOTk6eAcDX01QnCIIgCIIgCIJC3t/v969JG4rFfhAEQRAEQRAE62DoAbpBEARBEARBENQTYj8IgiAIgiAINkqI/SAIgiAIgiDYKCH2gyAIgiAIgmCjhNgPgiAIgiAIgo0SYj8IgiAIgiAINkqI/SAIgiAIgiDYKCH2gyAIgiAIgmCjhNgPgiAIgiAIgo0SYj8IgiAIgiAINsr/B3/yqSOFv+c+AAAAAElFTkSuQmCC", + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAvgAAADpCAYAAABRN0P0AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAAxOAAAMTgF/d4wjAABu+klEQVR4nO2de5BlV1X/1+3H7Z7unskk83CYDElMAsHwUkL0F0kgFiFQChYgFJQRibwTYiGiPAQJZYHiz6QMJSgIEiU8qn5gWcUPhRiU+hGFCPwiYvIjwCSEJCTDZB49PT093bcf5/fHnnXvuqvX3mfv89zn3PWpunXuee/z/u51vnudTpIkCSiKoiiKoiiK0grG6i6AoiiKoiiKoijFoQJfURRFURRFUVqECnxFURRFURRFaREq8BVFURRFURSlRajAVxRFURRFUZQWoQJfURRFURRFUVqECnxFURRFURRFaREq8BVFaR2f+tSn4IILLvCa9vbbb4e5uTlYX18vuVTDvOc974FLL7200nXm5Q1veAO85jWvqbsYiqIoSgoq8BVFKZ0XvehF8MIXvlAc9/a3vx0uvPDCzMu+/PLL4V3vetfQsKuuugq+973vec1/2WWXweLiIoyPjwNAvMK7qHJdffXV8Bu/8Rup03U6Hfjyl788NOzDH/4wfOxjH8tdhqz4lj0vn/rUp2Bubm7oNzExAU996lP70yRJAtdffz3s3bsXZmdn4ZnPfCbcddddQ8v5zne+A8985jNhdnYW9u7dC+95z3tAvy2pKEoVqMBXFKV0rrnmGvjCF74ADz300NDwXq8HH//4x+Gaa64JXmav1yuqeIoyxFVXXQWLi4v939GjR2Hnzp3wile8oj/NDTfcAB//+Mfh1ltvhUOHDsEznvEMeO5znwuLi4sAAHD8+HF47nOfC894xjPg0KFDcOutt8LHPvYxuOmmm2raKkVRRopEURSlZDY2NpLHPe5xybvf/e6h4Z/61KeS2dnZ5NixY8nJkyeTt771rck555yTbN++Pbn00kuTO+64oz/tzTffnJx55pnJBz/4weTss89O5ubmkte//vXJ2NhYMjk5mczOziazs7ND0yKrq6vJjTfemPzMz/xMMjc3l5x55pnJ+9///iRJkuQrX/lKAgDJ6upq8slPfjKZnJxMxsbG+sv76le/mlx66aXJe97znqGyf/azn012796drKysiNsMAMmNN96YPP3pT09mZ2eTiy++OPnmN7/ZH3/99dcnz3jGM/r9R48eTV772tcmZ555ZrJjx47kec97XnLPPfckSZJYyyXxh3/4h8njHve4ZG5uLtm3b19y3XXXJSdOnEiSJEne9773JRMTE8nExER/OT/60Y82LePCCy9MACCZnp5OZmdnk+c973lJkiTJK1/5yuSqq67qT3f22Wcn119/fXLllVcms7Ozyfnnn5/8y7/8S/KVr3wlefKTn5zMzc0lz372s5NHHnmkP8/JkyeTd7zjHcm5556bbN++PbnsssuSO++8sz/+X//1X5OLLroo2bZtW3LGGWckv/iLv5gcOXLEWfY77rgjedaznpWcccYZyVlnnZW8613vSlZXV72PRRqf+cxnki1btiSHDx/uDzvnnHOSm266qd+/urqa7Ny5M/nEJz6RJEmS/O3f/m2ya9euoXLcdNNNybnnnuu9XkVRlKyowFcUpRJuvPHGZO/evUOC57LLLkte97rXJUmSJNddd13ypCc9KfnBD36QrKysJDfccEMyNzeXPPjgg0mSGNE+Pj6evPa1r02OHz/eF63Petazkne+851D6+IC/53vfGdy3nnnJf/xH/+RrK+vJ4cPH06+9rWvJUkyLPCTZLPwThJTEXnsYx+brK+v94ddccUVydve9jbr9gJAct555yV33313sry8nFx//fXJzp07k/n5eXE9z3/+85PLL788eeSRR5ITJ04kb3rTm5J9+/Ylx48ft5ZL4hOf+ETyox/9KNnY2Ejuuuu
u5Lzzzkve/va398dzke4q/2233TY0TBL4Z511VvLtb387WVtbS373d3832bNnT/KiF70oOXjwYLKwsJBccskl/WOMy3j2s5+dPPjgg8nq6mryF3/xF8muXbuSo0ePJkmSJHv37k0+/vGPJxsbG8nKykryta99LVlcXLSW/Z577klmZ2eTz3zmM8nq6mpy//33J095ylOS9773vUPb4joWaVx22WXJ1Vdf3e+fn59PAKB/DiHPec5zkje/+c1JkiTJ7/zO7yRXXnnl0Ph///d/TwAgOXbsmNd6FUVRsqIWHUVRKuG3fuu34OjRo/D5z38eAADuvvtuuP322+Haa6+FjY0N+Ju/+Rt473vfC+effz50u114y1veAueeey588pOfHFrOTTfdBHNzczAzM+O13iRJ4AMf+AC8//3vh5//+Z+HsbExOOOMM+CSSy7xLvtLXvISWF5ehi9+8YsAAHDvvffCV77yFXjd617nnO9Nb3oTXHjhhTA1NQXvfve7YWJior/9lEceeQS+8IUvwE033QR79uyBmZkZ+LM/+zM4efIkfOELX/AuJwDAK17xCjjrrLOg0+nAE5/4RHjjG98I//zP/xy0jBBe85rXwFOf+lQYHx+H3/zN34QDBw7A7//+78OuXbtg69at8Gu/9mvwjW98AwAADh8+DH/3d38HH/rQh2Dfvn0wMTEB1113HZx22mn97ex2u3DvvffCww8/DN1uFy655BKYnZ21rv9DH/oQvOAFL4CXv/zlMDExAWeffTa89a1vhZtvvnloOt9jwbnrrrv65ymysLAAAADbt28fmvb000/vj1tYWBDH0/kVRVHKQgW+oiiVcPrpp8PLXvYy+PCHPwwApsHmJZdcAk996lPh0KFDcPLkSTjvvPOG5jn//PPhgQce6Pfv3r3bW9gjhw4dgsXFRe+sOhLdbhde/epXw0c+8hEAAPjrv/5rePaznw3nnnuuc76f/umf7v8fGxuDs88+Gx588MFN0+Ewuv2Tk5Nw9tlnD22/Dx/5yEfgaU97GuzYsQNOO+00eOc73wkHDx4MWkYIj3nMY/r/UYjzYcePHwcAgP379wMAwC/8wi/A9u3b+78f//jH/fYZn//85+G+++6Diy66CM4//3y4/vrrYW1tzbr+H/zgB/AP//APQ8u75ppr4MCBA0PT+R4Lzl/+5V/C05/+dLj44ov7w7Zt2wYAAPPz80PTHj16tD9u27Zt4ng6v6IoSlmowFcUpTKuvfZa+PKXvwz/9V//Bbfccks/Krpz506Ynp6Ge++9d2j6e++9F84666x+/9jY5luWNIyyc+dOmJubg+9///teZbQt7/Wvfz3ceuutcN9998HNN98Mb3jDG1KXdf/99/f/b2xswAMPPAD79u3bNN1jH/tYAICh7V9bW4MHHnigv/1p2wkA8PWvfx2uu+46uPHGG+HAgQNw7NgxeN/73jeUucVnOQAmi07R7NmzBwBMdpn5+fn+b2lpCd7+9rcDAMCTn/xk+PSnPw0HDhyAz33uc/DhD3+4H42Xyr5nzx749V//9aHlLSws9Bu7Ir7HgnL8+HH45Cc/ORS9BwA47bTT4JxzzoFvfvOb/WFra2vw7W9/G37u534OAAB+9md/Fv7zP/9zqHLyrW99C84991wV+IqilI4KfEVRKuPiiy+Giy66CF784hdDt9uFl770pQBghNurXvUqePe73w333Xcf9Ho9+PM//3PYv38/XHXVVc5l7tmzxyneO50O/PZv/za84x3vgG9961uQJAkcOXIEvv71r1uX98ADD8Dy8vLQ8HPOOQee85znwEtf+lKYnJyEF7zgBanb+4EPfAC++93vQq/Xg/e9733Q6/XgV3/1VzdN95jHPAZ++Zd/Gd7ylrfAT37yEzh58iS87W1vg263C7/yK7/iLBfl2LFjMD4+Drt27YLJyUm488474YMf/OCm7bv33ntT8/7v2bPHO9WoL2effTa88IUvhDe+8Y3wox/9CAC
MiP7iF78IjzzyCPR6Pbj55pvh0UcfBQAjpMfHx2FiYsJa9muvvRY+97nPwWc/+1no9Xqwvr4O+/fvhy996UtD6/Y9FpRbbrkFJicn4eUvf/mmcddeey3ccMMNcNddd8HJkyfh+uuvh8nJSXjRi14EAAAvfvGLYXx8HK6//no4efIk3HXXXXDDDTfAG9/4xuw7UFEUxRMV+IqiVMq1114L9913H7zqVa+Cqamp/vAbbrgBrrzySvilX/ol2L17N/z93/893Hbbbf3oto23vOUt8L3vfQ9OP/30TZ5n5I/+6I/gNa95DVx11VWwdetWeMpTngK33367OO3LXvYyuOCCC2Dv3r2wfft2+Ld/+7f+uGuuuQbuvPNOePWrX90XnS6uueYaeMUrXgFnnHEGfP7zn4d/+qd/spbxlltugXPOOQee9rSnwb59++Duu++GL3/5y7B169bUciFXXnklvOENb4DLL78cTjvtNPiDP/gDeOUrXzk0DbYb2LlzJ2zfvt1qAfqTP/kT+NM//VPYvn07PP/5z0/dVl8+/elPw0UXXQTPec5zYOvWrXDBBRfARz/60f5bhs997nPwxCc+EWZnZ+FZz3oWXH311f1tkMp+8cUXw2233QYf/ehH4cwzz4QdO3bAS17ykn4FAgk5Fshf/dVfwdVXXw1btmzZNO73fu/34Oqrr4YrrrgCduzYAbfffjt86Utfgrm5OQAA2Lp1K9x6663w1a9+FXbs2AFXXHEFvOpVr4I3v/nNeXehoihKKp0k0a9uKIqi+PDd734XnvSkJ8EPf/jDIeuQRKfTgdtuuw2uuOKKikqn2NBjoSjKqKERfEVRFA96vR788R//MbzkJS9JFfeKoiiKUicq8BVFUVL4x3/8Rzj99NPh7rvvhhtvvLHu4iiKoiiKE7XoKIqiKIqiKEqL0Ai+oiiKoiiKorQIFfiKoiiKoiiK0iJU4CuKoiiKoihKi1CBryiKoiiKoigtQgW+oiiKoiiKorQIFfiKoiiKoiiK0iJU4CuKoiiKoihKi1CBryiKoiiKoigtQgW+oiiKoiiKorQIFfiKoiiKoiiK0iJU4CuKoiiKoihKi1CBryiKoiiKoigtQgW+oiiKoiiKorQIFfiKoiiKoiiK0iJU4CuKoiiKoihKi1CBryiKoiiKoigtQgW+oiiKoiiKorSIibQJpqamYNeuXVWURVEURVEURVGUFB599FFYWVmxjk8V+Lt27YIHH3yo0EJxOpBAAp3+fwAQ+33GpYHzhpSpCfhuv+KGn0tp552tn547TTuXRhXpPuSiimNa5XXdhnPUde3x6zTPsm3PntBnUZZ7Tch9KMt68mwPL2MdhF7HTaKpz5i6jkOZ+6SqbXJdV4997D7nvKkCHwm5cYYIHtdOqvPijPlikQgpb9tueqG4HnB0PI6THnp8eSH9SrXw850fDzo+5NqwPVRd50oorrIWTdp+agKua6/IY8H/S+dCaCDKdR76jgvtD53Xh7rPm7rXXyZZ71WjStZ9ZNMILr2QB59zNsuzwFvgh9w4Q/qlDePCyia0+Dx6wtsZ5X1T1QWoxElZkUXbsso4V0b5+h0VQp5v+rwLoy37K5a3JG3C9z4uVeqLIktQyGc6b4FfFXkrB6NKFrtSk8j7Oluj7orrRh7LNVN3OfS8jwOfN4u2aUPenOex93CKfHM16qTtP92/bqraP2UHc/IuX7PotIQEOkNilv/4NFWWSyqjz7giyqqvNBWkruMfsl5+zVYNvWso9WKzy/jYaPi0tnnTlhdq2anz3JWIrTy+tOH60/tIdoo6b6OL4Cvl4HrzkecizOIdC7F38XGhDyg+DKNYaREsJQ7yHhfp3KjT1x6CFMVV4qSI1/dF+eyLnDaE2O+dMb2p86EtNpy851SWt02h6yzK+hTyhqyKY6sCfwQpyzvGKeIELsqipZacZlD2A7jOyLjPupskQJRwsrYjS7PKcCEUMi1AMRbPplh0Yq84x7zvslDUOeV
TGbXZ2HzJu+/zVK7LOO4jIfDzHvQ2UMf22y5ApX2ERtolf3DI9E2BP9zKEFZK+wgRodJ1wZ95IQEO6XkZep6mVSjqou71pxF7+ZpAU/33ZQj+kRD4SnkWHdsylfYQcn6UNW2W6W3LCKlYZF1HyHAkNpHf1EpVDGTZb2XYbLglEcsWKtJdwzi2CkdsxHa9xbiPsuI6P0Pwtejwe7qtv8h7mrSstMBN1fbgkRH4bbp4yiLkhhfL/pQupLovqjYQ04MvVkZBAI/CNpZN3fuvSJ+9Ujx1nx9FUcQzVXpWS89zPk9amfiwKr3w0tu0quxsrRT4Mb0SjBUpslNW9LXMY2B7M6Geezf6kK93HzRBdI36NVI1oQGWKhvyFfVsiO2cksoT47XYBIo4tiFvsWzneVU6IMSOil163fL+Mq7bVgp8xQ09wZTRQI/1MLo/lDoJfYBnsZMUKaaLsrNU0bAwL7FZd5qCBlXjQwX+CCJ58PNmFoj5onZZd+g0MW9DGvpA8qfIfdXm/V7Ga+Q0v+yoIG2361zK6sGvmyYe27L3XxP3SQh5Pfe4DN/9lDadpG+KiJa7bDe8fJJl2LbeIs+P1n7oqu0XUdEUdVOL6eGSRpPKyulAMvRTFKU5jMrzqYn3JryrKu2k7mObdk0Uec20LoLv8l8phqxZPnyX6euVqwqfcyLGc6WJD0elXZRxXYxq1B7h96Msb1DrTBwQel+Kpb1WKHnfattogk2pLnx1BO1PI8STH0KoB78OWifwlbgZ9Ye7i5ER9Bsb5S5/rLUvJisnr2i02ePouFGGN7gLnY//59MUea8t83iN6jNhVLfbF+m+MWr7LM91rAJ/BHF58Hm/T6YG3u8T+cjj9Zf8bFKZ8iw3bVq6Hh8PYCspW6hnJa1cWAGQptPKgUiZYnHUH+BIGTbJ2O49TT62eRrfNnm7XRR9fkn7Ke++C/Hgh9znfKctutF2SBlbJfDbehEVjRRVS4uuhfT7nIBcmIdUHPhrbdsF5EqBxtcTcgGOTCQyi4CPUfRz0e4qIx+ngr8QQq8vvZcXRxECqSh879Nto4wG622myHtAVWkzi6So7W+VwM9DSE2uaZkgyvDch6zTp8V46IXn6o/5WESJryj3ma7MSkGo2HZF6n2Xr4I/iNZWdksiqwefL6NMQrL9ZCFteTHcz8t6exXDtuWl6LYK3LZWVduSUOoqT8h6VeBnINZan40iHiK+67Bhe20cWpYQS45k58m63kaTV5jbxoUOzzodkiaubZF6H1E+NuYn5uk0KvZLY9RsO0UlOOAUtd+qvF/GdqyLLk9s25eVLG+/fWiKvqojsBta2fES+FwcxrrDFTs+mRtcPnqftxlVVCRi9pdGQR5xXcWwLNP4euZDBH3o+lTs1xp51OdONnyj41XfS2OOzPoQElxq4vbVQRX7KY8HP21cGWRthI94Cfy22x9cPnRbxLhJ+8HXouMSz2nReVcEX6os1CHOW1EhyCOa+XBXf+h/n37fcRSbcObDab/Pfz7vxsawsPeN+iMj1mA37ZpX/Cn6zWKWZ1PIPEWUMSRQE+OzNq3MbW9EXuR1j/eRKnRVHg9+mi24TGdE1v0yshYd34PhspbELPhjf/CqOPCkyIi8rT9NwKcNcy3DVrashAh5LtxD1sHfBkhvB9LeGGRZd4Oos5Ie2/22bqrcH1Xfu9sokJtGWcc7S7R8lAm99kbWopMlapLWUDQmyooK+frfKXVlnYm6ApHXp55HvId0fad1/QcAWF+XtyON8fHhfire07q2cXx5rml5xcFHtLsi+y0W/GUT9fVcAGX48JvowQeI1+KSxYYaU/mzUnb7tap892mWnBB9m1cL+8xva3Dsy8hG8JV0NMpeAKHR65CGqyE2mjTR7hpuWw4V7UniLodrmAQK4dXV4X7sdk7dGLECYBP2knjn6+HT2cpjE+tc9LsqAS2P6ldFWwJNZVPEfqpD3DcNKaGDPjvbRR0e/LwEC/wmXnwSWQ5IWdk
Kioi+VN2gVVqf7aYWiwe/UIrIFJOlsapvRN71XxL0KNiTxD49DkNhz0W9bzTfhc1bLwl8Ku5twh6nswn4iYnNIt9VWaD/bb59Xhlw5eFXsZ+ZOhv9lkHeBq9l7QNpuWXev9OeNTG4CdLK2HQPfpnHt87nf9M8+HnRCH5BZLnZxHhC+BLrCV0KebPBZM06U1RU3ibyqaCXflTIu6L8tCtF9UPpkOtIitBjlwt5nJ6Pw3J02PU5Pm7KbRP5KNp9Gt/S6X38+0iaj19RlOioK/uQooQQJPCliMkovypt23a7XjNKNzQ+vSsbkas/CrJaY2zDQvqLiNBj/9qa6aK1BQX32tpm8b6+PjwM5+VdqaKAlQNX+aV+is0yQ7uSFUeKto+Py+J+bGwwL/Xz0/ETE+75cRqAQddWGbBVAnyj+ir0R5qiffhFevDrTKnJy0CHV420Xp+U002Cl7nIY17n8z+PB981bVqWwZDy+U7nc14FCfyRitp6kOfiLSrVWJHHxCcVpi/Rnyu+gt5XvPraU3ztLKHi3haNp2KdDnMJfT6v7S0AL3sRDWl5o1Zb5J5bb6SI/uSkmQcj9LiddDkTE4Mo/vq6fV6cZ23NT4S7LDxSf9pwxUmMDTKzUNR9s+j9UPd+5Y0Nm0DUz78A2rIdAOnncYhFh49zWa3r2oeZLDox1KKzUvSO9o2a5PH8u2qJdXkhfdojRCPyfcS8S3iHdouahkfLXfYbLuRp1zacC34a9Zei9tyGwy04tv0sCVbJhkOndVlycH5XxH58fDjqzsX1xMTmH11mt7t5WVghsFUqJDuPJNjTrDuuSL/ipMwodlWE3Dfr2LayM6pwXM+hOrZfqmQ03YNfxXGMcbvzlClv5h/bm6AiMzPl9uA3qUZdBWXsiyhEcgBFv1nIjE/E3WZ1cUWrXdP6dAHcDVVt68N5fGw3VLRj5YCKe6w0cKGPy7WVxddjz/3uAPaUl3SZLmsOdvFHI+6S2Oe2Gr7ubndY4FMBj8K82x3sV5tnny4fx9P1cLHvG81PG6ek0vZnU11it87nft3PFh9hFmuaTxtV7NO6K2ZlE3pN+FQU8+It8HmNVDohmnRCF4XvttZ9U5JwZbdJy3tv899HgcsW44qCA9h959JPaoTqWh+AHP1Omw/XhfNwYS9F5lHEpwl+bu3h3nqpUkKxiXb+39Vw1jatKwUmFfV0HBfePKqOTEzIAh8j9hjBX101dh2cZn3d7DdaIUARjsOojSdNoKdF89PGKSKj9AxCqooU29palYXkZ4/l+LrcDLGU0ZcqU0DGohVDPPhp87qm812e73jfZXoLfEmgSrXWmCnzREWqTimWh6wWnCzLLhVJ0EtiOU0Y02lQCEvz2PzufH0A9kg9/W8bRisFadluuJDnw+n2SKIeyyiVhcJFpi1DjavBqQSdNknM8niFgk4jRfBxOP7wmEoienx8INRpI1sq5DFij/+50J+cHI7o8/XTyoVt33DxrtH8QmiDVSeUqrYPn/tVBa1cwcXYjmlTHQ11aJQY9lOeSlne4K607jTrc6jeztTINoYDE0pVJ3CsYj4v0nEP9YuVum+oGJWi8FTIUxHc6w2Px/7V1UG0Fqel0e+0iD4vU5b/fHl8uySxzoU8LoOXXfL1u+CeeCqq6XAqjKmQRdHuC604SG8OuEBGsY/zAGx+u4DLQSGPZaUe+27XzL9ly6ASMTlphtP9NzExvE10/9CGuzZcjXDTovkq8jPRpGAU4vPQr4M621lJgr8upDfaTahcjqqgL5O0yoLP9tfqwect2dWiM0yTovcSZfkIC98HNksL//V6Ruhil/7v9Yx4W1kx0y4vD4bTeVHgu9JKhjY6DdlG/C/ZgWz2Glc/L6ckGKWIOLfE0NSTfBqcD7tpDWqLQNrXuL38fEERzq021F6D4n1qylSO0KrDbT0o9KlVh79pcEXz6b7gx8Im5tWyk5kYrZJ5qKPSYrNpVoVk2amLpiYcKdv7LdGEik8e0uw
+PudriJ2n0Ah+mp0j9gNV9s0o1u0P8dXT6X0iNLU+KCV7DIpzFN8o2ql4x9/Jk0YAopBfWtpcAcAoPn8LgNlmuPfeF5ttQ9pG/l/yyHMhDyA3lLWlseRinpaJCneeaUZq4Mo/OMW3LYsolfavzzDJ4sQrHPyNA4AZL3n3pbYJUuVB8vy7yGrZUYLI6rWtirxlooG3Krat7opSbMevydQh+HE9TT2ORV2veeZPI7MHXxKOMR+oKluIx4TLVy+V2faWJm37QipQwZUtLqKp2Ka2m6UlI8xpd3l50E/HHT9uovcnThjBjpWAlRUj+lyedRdcjLkahdoi3gi3tLga5rreItgasPLGqNSugrngpfE0Ii1lsOHb4SNQ02xLtu2l28zf6NCsQ3QcRujx/+rqYJu5pWjLluEIPkb9u93Bf2rHQZEuNb718efTZaRZdjSSH4QrUBHzs8tFHc9hXlGqmiZarWKjbr3ShuMWasnxsezUatFxiXxltMhz3IPFPe0CbPbLY9R9YcH8X1w03YWFgcDHLv7HccePGwG4tDQQ9tSOY/OqS0IWQM4AY4t0+9hbfPePrZ+LRZewn5wcFvhTU8NC30es2vzv3HrC8RHsdBgVvzgvjYDTZdFKAZaL2q6wwS216OCv19vciJbD7Ti0YoZCn07LrTxpXbrf1JdfGG15drVBLCnlE8v5Pgrnq2sbq9TNhVp0JOvHKBxMANn+Urc/kZfDZdHhlhxbbbMMu86mfeXy2FMLzeKiEenz80agHz5sIvDz88aCc/z4YPz8PMCjj5ruiRNG4K+sGEGPdhYfeCQeYPg/Fc0omKWoOKZhtEXHQ6P7Nu+7FIXnXnIsJzYwnZoalJHac0IEvY00Ic+7UuSeN3TG8fi2hQp92viW70PaeJnvbxqF51+7xX23sTGcG5/nyqf2LjwGOI5XKCgq8isn9uwsNur04NdFU45NbNhsOFVrlbZ68aluku4jRWaeKtyDr2wmWn86I9ZMDCI2kYc2nOXlwW9x0Qj7Q4dM9+BBI+wPHzb9R46YaY4cMb+DB42wzwPadlygsKYiH7s08wrv4jgU5fwDUJKI8xX2XNBTkY+VDToNnZd77/n6fMDjSSPa3G5Do/Lck87tUVTI86g+jsN+6YNctnJLZcIKAJ8Xu1I6ThTuq6vDw/lbBtovTWc77ooygkT1rGoJMWiVUQoGV0mQBx/AHRm2zdNWXP72OrG9bbFdRLZUX1V4OTuQmHX6CPvFRSPmFxeNUF9YMNH4hx8edBcXAfbvNyL/6NFSy2/fsMS8YQAwFY5OB2B62ohqahOhnm6XeEd/t4/NB201U1PDYn562vRjd3JyeBr6cSf+BViXX5zjahvgitRj1zYfnxZFNQpoKobRwoXT84g+woU/b4SM42nFodsdjtDTXPjYxaw6dF6cDqeh+9THi6+e/NYhRfykaVzUkbq6krTHFvg6264x8hCTJrFRxvHLkr2mCNIyKmVJLWubttQ0mU2hjhtfzPsoJA9rLVkYuN1iY2PQKHZx0fwOHx5YbubnjQ3n/vtNhP7ee82wuoR9GijqqB2Hp1zkVp40US/56Gl0fnraDN+yxQzDH/+SK84LECbspTYSdNz4uBHCmJlGms5nv/F14LJw+VIkHNdpw5ZVSJqGVixwXfzNA029idNhRQBgWOQjtMwayVcyUvWzLobnnIr75lOFJa6O86So6zHvdVaYB981bZYNbUJL+bo8my5fPR1vy6oQ0u+qQRZyk5cEDxX2Bw+a6Pzhw0bMP/qoGfaTnwA88oiJ6P/f/yvbMGJhfHwQLUeBPTNjhmNkn+dXl0Q+jaxznz8uf2rKDJuZMf24ni1bhiP4/OutAHZB7/LES+k4JdEqfSVXiuDbcvVL0LJSWw6WCQU1vjXhaS5tFQ5ekcAPnnEvPEbj6Tz87Ydt+d3u8HhfLz6P5KsfvzBczzSf9kt1ULXQrkvYx6oBYiYWz70PReq9KtuihHjuo/bgp2XQSROdbaf
MG30RJ4WrfD6vtEo5lpLgw3SV8/PmP1pyfvxjM+zhh03E/oc/BLjnnuLLVDTT00bMYXdmZtClAhwFOvXA84a3UjpLWmnA5U1MGEFPhT4V9nQdvLEsgD0dJ4+k0+i29HM1iOV2HNfXf23DXPNLApdG3EPeJnB7EfXYU2HOI/mSBQdtV9SaRZE8+hwfka/kxuf1eMg93+cZEXqfrVqs1SUOm9b4uS6KsH0pfvgEQum4Is/dStJk+qyszRdkkRdKGRUkW43RVdu09UvLtpXfCylyj6kuscHs8eMmUj8/b+w3jz5qxP23vx2W+aYupqcBtm0zXSrsJYGPkXyefSfN0oNeelqBoMudnd3cgBbALhwlkU7HIyiq8Tjwj4Dxr/5y0c/Xx5fPhS9fNyJ9fAqhAlvy4ktC32aTwXXSBrXULoV2IZyeNkjmDXTROkSXxSP+tsa4vsJehX8pFP08K+LBX/Uzts4IcBPe7NdNkwW8q41g0cstGt+KfxYPftbyF+LBD4kO51lPbMT4qotia9SRZtGhw0qP3FOvPY3YP/igEfj33Wei91//OsBDD3kL+6pu/cmePQBzc4MPbQEYYUUj9zRCz4U+Cntu1aERexrBp8tCcU/fEGAXrTouYS9F1LkIphF4PFb0OwFU2EvTUlFPKwN0vbgugHRrjkv4u4ZxkY/efb5PpPkQKQUnbg/339PtpMvA48gj+PxNCRfotuF0nFIo0r0vNosOlgFgdNJkAqi4TyN2bWJDChw25VinWXbodHmWHUIhHnzen+fEKvukdGUDkvr5vLbIdVUnoW/GIh8RXyTex02yOqytDRrRYirLBx80kfu77zbR+/vvt6y3XjoHDojDk8svH/ZZY1R3dtYI761bh/uxESxabvgHpsbGhrPd0N/09KCCMDu7uQEt99ZzEU6FKBX7kmCXuviBMDotFfU8is8FvhTBl/pDkASvTQTzdKQSmMJUivbTD2XRZfAuTgcwiOpz25Ntm30aO6vwLw2X2OdvSn18uWWWqwzqFotNEXlV4ysqleLxsUPRcUUGuxvpwa/iJhJSUUmblw7Lc+MOyW6TVjZbZCnUgpOlDKnTUhGDqS/x67JoxfnhD00U//bbTeT++98PKmM0XH652cYTJ4yAXF4eCC4a4Z+bG47od7ty2kqe+lKK3HOvPRX2ksDmgp5H47HyhYIdG5vSihkX/Ph1YSrsucinKS4BNvvnXVF5qSEq/S99SZiOd/nbMdsPLZMNWjGh0XiaM58KfOrXp9YnKYIvlY1bdzSrTuXYnnmuN6U+Pug8z72qIrX82VE1tkbPymbqrowVSdZticm2lnaf8Flu1oCtevALpIwLq4z9F1KpSStHpgcMFfg0/eWRI8aGc+CA8dlb0l024YzqvOc9/f/J//yfZht7PfP1XPTZo3cePfpTUwPhjtPQjDf4hVmcDsU9EfjJxORAQxNNODY2DuMTTGRSpMg9t+RQQc+FPO1fWxuuJEiRfbo+bs/xidxzfzttjIxi2hYlt/nbsVIgCX3sp7YeKspx/2AefOmtxdjYoFLD/f9pby9s3nsV+ZVg89fW/ea0qnXw9dUtIm1vTRQFoJoKYZnXflHlzezBd0Ug+DifwrpqP0WSZtFJm9c1bRk3Ptv+Cz25fBqAZInS2CxLYvSe+u2XlgCOHRukvbzrLoAHHgD42teMHacJjWh9+fmfN7aj5WVTien1zH9s/Iq/LVuMcEe7Do3KY9Qf+8l8q2sdWFkBWO8NW+kBhoP/Y2Mdc4S5PSrvjwp83qVCnzfA5WIfxwHYI+iSoLd9G2BsbGCtofPYPPq4byShz6FR9BCrDp0HKyGYzhOXaysX7/ex5GgFoBDSIu6+Fp0yyxaNDbMkpLfTKuzjaAdSBbFtY5o9ildCeb/vsqV+XzJ78NtKyGvWqsrT+PVwaw4K3BMnjC3n4EEj6vfvNz9GXJd1OJ3LLx/qT265xTQgxvzsqMLRZ8+i8v0uTXmJwv6E0c00UA4wHMw
W9Z1kzeENZ3mXW3Rslh3JvmOz7LhEvgQV31Tco5DH8SiccWegUE8TvLQCQBviSvBludKG0qxC+J8uh/+3CXtFqYG6noF1P3+VOIhB3NdRkWqVB78Oimw8HLK9rga7tJ+Pz0raCWFrMGI7lrbh4j6glhzMlHPkiGlI+9//bew4//zPWTarmTz1qaZSs7RkKjgAwyK30xnOuEN8+usbHVhaAlhbGnwygNvqAQY2fhrM7gDz1fd6w5YabCjritb3emY6FPe9nvm/smLmXVkx20KXLXn7ubCXctlLfnkesaffBEBRTz9sRRsr02g+XSaHi3wAu9DntSpaSbJ58AEGdh6eiSfEi6/Uhi2a7BudK6MsZVKXB9/XNQAga5BRoQlaKw9Ztq8s90eW+ULvE3zaSjz4Svm07sZEBSNG7hcXjbD9yU+MLee++8RZW7Yn+nSe8hQAAEi++tWBEKb2FWpPQcvJxETfioPCHmeVIvd0EX1NK0WWue+eR5ulyD5vgMs99lIlwSe1po89BwU3Fes4/cTEYD/SnUF3TpIMKgBFwqPwNmEuDXO1PVCLjaIoShCt01EZiSJNZln++yzZaXxaQ5dFmdEH34bQadF717Sb9g0KueVlo0YXFkzk/vvfN1acf/93gG9+0wj+UeTcc010fnHR7BeMgC8vm/Fzc6bb7ULSnYLFebMbl5bMrsVgOdWFmHgHM3T2g9xjCUDP009PhThG9nnUP61Lf1hQLvix8DSbjm/0HiP2KNax3CjceQSeNrjFSDtfPoV79aldh65TymrD7U9SBJ+XU+pymw4vK18vn49PpxRGWhs0fr9tQ4S17m3wcQ2MqrBru5UphuMquRwkrzyfh89vm1dal2s9hVp0mk4RF0DZF1BMF2mazcqrnDx6jwL/4YdN5P6HPxwS9/VfwtXS2bdvqD/53/972CazujporAqbdSMNtKN+Q2s60v/vit5TAcmj97bIvsuvTzPurK1tFvc0go/rxDLyKD4X8fRDUtTSQkHrC3alSH7R8P3IDwwvJ43a28qTdZxSOTHdu9tKiP0pBkFYJW0/95rakDgtYJxnfh8ye/CLIrYTM6Q8ectu8+FXgc1zb8Mn8w4ADAsZFPZLS0bc33+/Efa332689w89lLX47WTXLlMBQmE/M9P3s3e6XQAYh/V1Mwp/VEeim4fqWLTze0fraYpMLsapcMeoPg6n2XN41N+WcQfXLwl8ihQ9x/8AmzPdYKQcu3x63GF8XSGCmVY6+Lz0i7dU+PN+vt1SbnsV8tFga2eW5pdtC7G9lfDN5Acw2v78tpBHpxQBz0TouvalN002D34RWRJteD856r6YlWJxCfa0xrjSuTA0jIaZ19ZMxpj5eSPo77/ffJmWiPsOjF70XqLzP/6H+cLt5KQRrdgo+VRaUUyHj3DrugS2RRVFJo3Q84VSfCL5WAnAaWkqTBrVl754i/NitJ//aNYZ3i9F8GllALvSNnKvPxfj0jJ9kfah1M/LJU2rRIFXQoGWQy0DTUOFfTtI+ipk868uXNcDv2ak+0hZ95ZKLDpZas5V+9DpNL47N+9bjdhukqGZEjbVVrmIxOj94cODj1d973vN/TptFfzUT5l9tbho9l2vZ7z4GxswvXNr351C9TXX6ACDrJv9NJnURiOJcyp4XVYejNjzYTzCv74uR/KlCD5Pk0mx+eMBBvPZfPAYYacNc7mHCWBzo9s84p6uLw8auY+aUA9+GdSVsi/PcyskG18epDcsdHibaFLWwizEeMxcYt2330fU+3r9bVTmwY/xIEnUbdGJAR/bkPOmgtYc/ErtwYPGd4/R+6F1KZTO4x8PAKcy7GBr2qUlgLExmNq9CqtTk/3Ulwh1eAAMXClD6TFtnnsOtY/wVwS8AofTYKRdGk4j+XzdttoJhzcqsH18yoYUqefLw23k4jw0cq+ifCSxCUo6PsZ7fQguG4LPfLZ+SlH7KC3pR9rxahJt2IamUcb1XEZFrdZGtjHc8OouQyw
XZprHPnU/UfGHOe8XFszvwAEj8PfvN2IfVNin0XnmMwEAIPn85wcNkZeWYHbbaUNWHapduU4eit5LfnCetSZN+PMoPxXwNmGP68Aund6VHtIXSVBLFQDa2JZDU26GCPoy0m0iRVYUtNKRG9/7dJXR4jqEnU/lpQl+97QKABJr+Tl165iisQnomCozZZXDdQ8JrVhUJvB5zR89U3WfmHWXIYYT1qeRhzRuyJqDjRox4ry0NPig1f79Jte9NqoNZ27O7NOTJwEWFqAzMQHT07PQ67mD2TSaD2tMvKcJaj7eFQHnhEa8AewpHukG4ldp6Ueu8FUG+pGwS5eZBq6TZu2hSDn5cRqXuOefEeblkYa7hknL90XtPrmxvdWs8/5dx3pjeF6VQRMqJRJZbBtNIVSXVEWWt1h5yKNR9a5fIzFekGllcmbNweh9r9cXpP20mD/+cQmlHQGmpwc+91ONbrvdU5lxLAyljQdLlN5H8KdF9W3wtwM2aG563qU576nfCP/jvPyrtgCbhbdUC7LtC97AV8I2HOHr59vFc+HTsocI8ZAKgVIotGGcbzQYYLiBIB3mS50NCkO2MytVbZe0/+tuqKk0hyq1W551eUXwee3W108njYutFX6d5YjxZiKVyZk1Bxs00pSYmDVn/36T6/4b3zAiH3nSkwDuuqukLWgZZ5wxaJy6sAAwMQHbz9oBa2tmF6P2R13Hv+cEALI9h8I/6oQpJKnQ5tNLoBhHy4stuk4LyD35XOhSod//ahf50NXk5HA0X6oMYNmk9dBGuS5o2en+oKJd+tHWzrb/fB1S9N9mL/JBRX8h+CZpcJGagSyFtPR6ZSKlCeT9WVIB0mmreh7nqazE+NxuOjG8EQvBlrbSVydLb4xc/bZxaRRu0anb8tIEmnACI96vLVHk45dLV1aMRQf996fSv3QAVNwH0HnCEwDgVKPbXg9geRnG11ZgenoKJibMICdFpl6UhCIKeprBBgV5kmzuAhiBzQU/Xz6tsaCgx/Vx0U/FMv9yrQueecdnWgleGaGCnFY4KLTC4bLySOtSwV45TbpnV0XaPvGxeTaROitZnJi1lq+obXKWo9AANw6vilob2Y5qZUBqj9AUnFlzlpZMg1AU9t//PsA99wy+PLpzpxmnhDM3N0idOT8P23b/FMzMDHY9wCApDQAJvHObjRS9xy79jx+X4v/xS1oouNfWYPA1Ldj8JVq+LiqU01JjSh58btmhkXse5eein0I/VCVF8NNENF0+jcrTNwr4wzJMTg4Pw/+8UsB/vqjwL40iRJ1PdN8VzYvtOZEW/HGVuWofc1uJ2YPvmyIydj2Udh7bpnFliuLHzXfeUEoX+KMq4n1own7xitzT6D2mx1xYMB6SY8cA4FTkXsV9ZjpPexoAACTf+Ib5su3aKnS7k04vvogkHH0ErmSvoQIao/I2i87ExKCiNzHhzmoDMNyQlQpo7reXrCy2bbAhReelBr9YLr5uXpGQRDrfDr48177wje4rjSYte0YMlCHAVCMUg+7H+shzTZR5bRcq8EMLOuonYyw37cxQcY9fXD12DOCBBwB+8AOTNefo0bpL2S66XbOfFxZg+/YdsLw80H00aN7Xmy5rDs86w33h1KeOXW756XbN9DwRP06D4n91dRD9px+4cpWNlkES+Wn9dDkUrIBwLz4tO4W+SXC9PcB9MTk5sBVNTEC/VTTmN8VIPhf+rgh+WteGVgpyUeY9Ou/zr47nR0glpK0WHYrPN2OqXH+dFBGFb8t5AZDPolNUZU0tOko+uMjH7Dnz86Z/YwNgzx5j2VHyMz5uxPLGRl83djrpiV2cUEGPQh3FOfrrqagFMMcbo/JoTVldHQhmHMe/EEsj9z4NW6VIN5YVuy6xz7fRZz/Q/3SYTYzz4VTc8wqBj6BPs+loRH9kaZMAUopBrU7FE6NVKAu1CvxRpkk5d603jo2NfsNPWF42ov7AARO5//GPAU6cMILm0ksBPve5SsvcWrZsMRH
8Xg+mp01weHx8kGgHwEP3cWsNF4zr68Nee0mId7uDyh0KV+xiZBwLtbpqKgpra8OR+7Q0nFxwS/YYHlm3TUu7Uh57W2ScVxpo5J53MXKPNS8auafRfBr1x3l9RL3rwKrYL4Uy79NNuPdzimon0FYPftUNcKXMRiFZWXh/nmMSmi5WKmOsFCn4fZbVGA++Yif2kxoR37Rw28bamhH08/ODxqBbt0Ln6FEV9wXSOf98AABIHnoIOhvr0O2OQ7c7/H2pTZrZJqKpoKf9UpdH8PGYU9sNgl77sbFBDk/aGJfadKRP8gIMR99tqTNxnE2U07L7LFNaBvZzIc4by6LAp5F63piW9vuIe0nsS0Jexb1SEUW1E3Cl3FSyYUs1GtKfJWWoT7pHW2PgWDRQbBH7Vlh0fIhtxxdBa7aH23MOHzb++5/8xNh09uwx/UrxjI0BLC3B9PTWfjad9XWPb1NxIYtWHGqZwUawNHovfSUWBSr/kjG173S7g0g+9d7TgvKv36JHnuKKtkt+/ZDxrooBFfQ0ZScV9hiVd3WpZYdaeOiypPVysU/Lq5RKVfdpzcOuIj8rVe8zSdCHtsUo6/z1yU7jM64s+BuWKohe4JexM+q+kTTltRRl6AZMP5yENp0TJ8wHrhYW+h5x2L273kK3mY0NgOVlmNluBP7yskdOfBSJPP87z0uPXSrEMU0lCnYAc5xpP82Y0+sNRD5WFFZXzXJQ0KPgn5z0a0TgisDT/yGCHqeToubUBoTRecmawwW91KCW9tN5s0btldKp6mHcpOdAWdT9TC6DKp/zWkGKN/MUUsfxqU3g2w5G2TfVmC6CRr+doKkxT540Av/ECSPwl5ehs3+/+ZKtUgqdffsAACBZXoa5uSk4fnygV4Oj+JOTZgYU9LgQgOEoPJ0ff9iQFkUrRuvpf+yikOdf1rXlznd9XMoW4ZeEsUv424Q9997zjDk8x70riw796q7k35fWL5VPIk38+3zES7HS6Ht0AWTxyrs83qNImZaUsj34Psc+i0VHr6lqtr82ge866KNyAjRtG8Uofq9nIrOnGn7CxgbA1q31FnSUOJVNBzUmTVTTRxKKkvDjH1zC6dFHT98AoJ8exTs2rsVoPW1gSxvk8h+ALPB9MuzwsiNpYl/6LzWm5dYcLsxtjWy5116y9fj673nZXfugpdT1MZ+m3Z/LwOW5dwm50GjqKFQAspxPviJdmidLf9q00rqbYNEJbQBeJiNr0ZF2bNvEfmsyB/CPW6FHRAV+tWxs9LViENIMPKUljfBjHnuM2mMXp8UuNgbg9hyXBz/ta7t0mG8k2yaOMTqP22uL4OO+cFlzaJfaeKhVh4t5l7D3EfUjgu2eX9X9M9Z85jE9C4sUbqMe5af4fNm07n0Vw3kYqg3r1pJ1nOPRCHzbjTuGE6lo6r44QxHLi6IMP3C1sGCGz8wA7N1bbQFHmYkJgI1htwd12liFo3VZG8NfmqWpMG3ReirsaUtfl7CXPppFhT1veIsbhkgfsALwS33J94ktep9F6ON+DBH2tExSefl/qd9Fw2w6VX4IJhZiec75RIiVbITsz1j2vWTfAahfLAPEc834MlIefBdlvsKpmxjKUBgo7Ho981tfN9HL7dvrLtnIQfVh6oS+cK8PCnochkKfZt+hue5tthxqyXGJ/DSbjmvbbPYc6Yu0OI7+aMYc3DabWLfZbnyi9a5KiLQtUr8PDRD5off9MkVoDAImK0Xsk1Y9p5QgsrShGEWLdROIRuDbTpAiT5YYohKtsOhQMYYfuVpaMsOmpwF27qyvbKPG8jJs6W7Atm1bYG1t+GOy6+sA0E2J3NMoPW1YC2CPwAMMf9EWwKwYVyrZb3yFPRf1ISLfR+DzqD0dTsdTYY/dkEpAWqQ+i7C3DfMlUpGf19oBUPz9tG6BIkXSfcuUp+xFR/Bj8T0r/vjkxXfZs8pOkuKjDWM8n0baoqM0EGrPOCXEOh/8YM2FGi0627dDcuQITE9vgelpM8x
LE6PQ4952mwjEigAKfWrnoWBU3+Wzp11qxaFfuOW4NsoV4XalzJREN06PaUGlLDeSsJe6oVacMsV9hBQpBGII3pQBF1ux5ubPS1uPnxIvo/CmIRqBn9aoKg94IGO5gcRSDl82Zc9BuGiZmqq+cArAoUOw9Zw52Ng+CUeODNo6JwnIjVNRrPP/PoKc/0eBTjPluOYBSM+aE2rNodhEsk+KTPrf18oDIAt6uv4Qf33LRHyVVB11LpM8GVdiIsYyKf7Y3pDVJY5D3mKlnXuxNqQvkmgEfhXEIvJjKUdmuMjHlIEYQlaq5VQ7iG530u3Fl6L0+N/XvsErB3Q51LPvsuCgILbZcqQGtj7QBrZYPtqVhrlsO3S8S+CH2m+yCPqWCf8YRLSiKNmI/fr10VcawS8AXyHryuma9yC0/SBWChdlKOy3bdts11CqYX0doNeDLdtnoNvt9LV6P9GMS/VTcc+Ff56u6z8X8NJXbLN67xEpLz6dNk83dB7Xf1v5fcaFEEEFocqvwuYJoMTw4A8pv5R7PrYAUuj+jK38Sj24PtzFx+dZR1ZCgrUj7cEvs/FpDDfs1kAjuABG1He7AFu2DHzLSrUQAcuDyJsEPB8mIY0LrQi4/vMIvjRNKGmedVcUXxqWRbz7ROjrEPcRUPX9t+lvSX0+DNWkhA2+jTXp8CZslzLaxH6OFirwixLSRbbIxgMQ+4GIFXG/USEzMWFy3+/apRH8ujhlkUqg069vTUyQ+pZN/HJBzUW77/+QcT7Ds5Im8nl/yH9f0e4j4NOEe5HCvkWVhFBCBHDTAkBNK68N17Fpc7rsWIm50lhWpa+qyqR68E9RZJpMPHgxn7iNBMPE3a6x6MzMqMCvi1MijgboO51TgyXfvdTvux4p6w7vl5brI+bz2nLSxmcV5qFR+Cyi3ncaXyIR9jEIUd883nyeJtCm51kZ+7xN+6cKQt+0VElZvvqqzpGRs+ik5Vttw8XZ2i/0YvSe5maMRFSMHMSTg4eFtze1Wmx8lo1I8xURnS9S2KdN5yu+fa00ocNDpwkhouuv6fc3/qGfutbdFlS8K2US0oazyPMmxmw9lMIFvs9HEmxIX1DLWobYI/ZNaRdg3Y+oJNETMjNTQ+kUADD7/5S4w4/Jiu1qfcW9ZNWhw+k4W7S+TrGZ1ddexrgs0/kSkaCnxHhfC22AWuc2tPHjUGVYcdqSSjRWYryOm0idWs9L4LsuiqJaIEuZAIqkKYI6BlL3FW3J2e1WVzBFBr+gCsaaQ3oHSOLeJdLTsFUCsi6vDIoU3iHbVNb2x7JfHTThHtuEMvoQexDLl9Av9eahzGhuG2jLtYE0dXuKyioZHMHPm37M1l+29yuGA13V55zzkiruUSji10yp4Feq51QEf2zMNKzdFEC3Re6lhrZ8PMcV1Y+VPGUcYbHuQ8z3MYm2fAirycK07v3X5H3XdiT3BRe1Lh3J+9Omta07K5JlJ8syi/LrR98qMq9lp46GDa1P8UXFCYp8PlypDtK4mdazNol8gHwCvW7rTR6qKHdT900G6hZpWSjqnlxVxLlNz5AYzpc27c+yiMHpULZvvujl8rL6BnKr0InRCfysvjrbfHpRl4RG7eNhehpgbAw6kMDERMr1E2M0W4mSuh/0RSNtT1FvpMvYV23y4seQncXn2wKjSp3XekwJVnj0X7KO583qyNcV8kZC6ncRhcDPUpORXtuE9CthWPfdGE/XolQOieC3LluTopRIUVG0GARsE4npudyWNg2h1LH/m7KPyzg/q9z2KAS+yzMlNcSVdpDrNYmtv6gdLZUxFsq6gW5smN/aGsD6OkDSjOu1lZxc7sD0NEBnY31wYMSWttm+LVX096iqpuwXD/piozk0RVi0idieiRJ5z4uQbYztHKyqkhWrRpKQ9GGeoDF/GyC1NZD6+VvC0HMnCoFfBDFFAtoOasiNDSPu19cHqRmV6llbM8diHGBwYPD/qTcsRX1Mtkli35Xox3deH3yXr9fH6FH
FG7XY23w14dlcZRrV2I6XfufBnzQ7eFEfaPXBZ/m1CHxXVF7aQUVYeEaVIvYB3ZdUO66uml+vZ34d3d21sLx86jMEa2vmQLAIPj1mlDyif309rIxFveEJPcc2ffBLIC1ZUFEZMsuuaCjphAq5oinLtx+baOQ0ycI0Co2nq47Yx7DNFFd0vKjvK9g0aJQWnSJeF4SSR6THeOOIgSz7lE+PQeIkMZoSRb5SD3g8hiL4LOMNFZe+Qt8m4tPEel1RfkkMr60N/mepgLoyifpkEbVN50sTs5LGTixiY5SDUKO63QCj6/VXqqewCH5Wwe+TEqmoj2kVMV1Z8xdFkY2LXfOiLWdlxfwWF1V81AUG7QFg0DCiL/I3W3Tof0nEcwEviVbfYSHjs2I773yHr69vHoaVAdw/0psAV9ZQ6XtivuVzUeSyRo0Y7s+cIiP6sQnHURbxPtT9NklpLr7XViUWHZf4lQpapFjWC8dOrjckycCmc/IkwNKSioy66NvuJ8aGI/iWaQGGhb1L0KdF+6u2/aRZbqRzUPq+F++3fcoB+/FNAH0LMD5uj7D7WHyy2oBcy9JrsJm0UQz7JMNQDLEECJV2EXUjW5sHXy+G7EhvS7J8VAIDxb2eEfgLC0PZGpUKwUxG0B3bFMHnbW5RRFNRbxP0of9dlQY+bcg4yurqcL+PoOfWHFpJkD4KlvbfJvq54M8i9osS+kUsq23EFuFGRkn0ahrf+hi1fc199XVf93VY8kqRZEXmFralK5LWGVqWGA56FopK1wSw+TVhAp3UZVER0euZRp7z8yrw6wLbQ/R7qEUHhjMeScJeaoTrGiYJedf0/H/oMBehEfuxMVNJkIQ7Fep0uK3Lh0lRfl6ONAFeZCS+yR8eLpqY7/NFP/hjrcgAjJ7IDKHs4zbKbT6KJsZrSyJIkoUI4jziOfadZ2txzf8XaTNypWbynT7P8eBZdPD/xoYR+OrBr4++lkeViSJ/YkIU5TZh7xL8PPKfZxmIj9c/BFfk3le0j40Ne/Kp6JemRRFNxTR2uX+fi3eXmC8yoj/q12Xsz5OyqFPoq5DMTlMDjzHT9P2Z9XoKEvhV7aQ0S07dF0DV63Yd3Dz7wueksS2b2hSWl41Fp0k50ttGtwvmQKytiRF8Ksyp+LYJchrtT5tesv6ECH9+3oSm4EQk6w397+pK0Xv8Sf2djntaOnxtzS+qbxP6KvKVrNT1nExbr1YA3JShcdq+z33aejZV6Gd9+xK1qcJ3g9p+4taFdDFwmwH14i8vV1g4pc/Y2Cl71FIvtYFtmlj3Efa0H6d3TUuXFdIGQOqXth1Bf75kpUmLxNOoOxf8VMDTeZJkMB2K6LSsOp2OWUdoVh6lvYzi80vtIopSPoVYdMq6UG0e/DR4eWw2mrRlVPn5ahehFp2Q5QLI5XSNo2KICriVlcxFUXKwcyfA5PJx0xCC1bKoHYSLcyn67hLztALAl+daVh7BL/VzbJ57H089/4+ReS7qQ8cDDNqkSJUD6S2FLZpfpC9fGS1iiVyqmFeU7FRi0VGaiW+0xGcaKkKkKOR11yVwwQUAv/3bekOvgvn5xNhzFpeNuEfl6Aonw+ZAv0uQ+wp723hcFn87kGbXCbF82UQ+b1QrRedxOP5oFF8S8xiBtwl9GtlPi+hTnz6N4EvRfI3wh1O3sI2FOmytKurzUbcVuShi2YZYyhFC3nOgEIFfxOs2Wz587FadXivmr/TGwtgYwMyM+c3NARw/bjTm4cN1l2x0mJkBmFw7lacUWzqT0LEk1osQ7aurgy8ZAwzb/tfXB293JLEv2Xpol//3RfLd0/88wk7rQLjLpC4V71TY83HYj920igK3BvFtKUrkj2KloIkP8zKpY3/4fmBSsRPLG5g8xFJRiaUcIWB5W+nBr4q23mRCT+bQkwjFDP7Gx43QO3YstKRKLrAhxKnMOWmThop7Kt65gKcCXxL2/Rz9MOhK0X3atTWyxei/rcGqTyNbao+h0XxXWcbHB6I
dt59G3HEcLRsO39gw3bSGw7QslLxR+1EU902irc8eH0Z52xWlCgoR+EVcqJLnu4ooty21ZdW1PbqtZXnuQ8ph+/hVAp2+GBkbA5ieNtH7rVvNdEtLxgquVMPaGsAkwCCsLgh8Lrpt4h7FOBf2dDgdZuvyCoBrvVh02uX/bdhsObb/NGJOo+p0HI3UAwBMTm6O1vNI/cSEWd7U1HDknlYK0qw9WAZbJJ9uh3ry20GVAlctOkoVZP1wZlHrzjM+RvC6bbwH32XRGQWqbPxbFDSCv2XLQFsuLtZbrpGDqmEWCrYJZVfUHqPytog9tej0em5hj9PZ3hQA+DW0pUjCNkvOe26hAdhszUE/PY3W43+MzuMyV1Y2r5tH9NPA8vC3BEr7aINN0wVPba20H5uOi0WzjBq1efDTDrorE04V1HlCxnQztN2kJyeN+Ol2TeRybs5E88fGNIJfJf30mKjKAcTWz64oepo4p6n1V1fNdNi/smIX+tIbAF6hANicltMFF7s8Uw6dhnddWXV4NJ1H2rGhLo/cozVtfNxcCzRSj+NpJQErCDhscnK4ooDlkUR+qNgf1YqBion4CHnWK0pTKLuSHoUHv00XaF21zVgsOrZ18fJQm87U1HBj224X4PzzE3jykwH+4R/ac27ExMMPJzA2BjA5tm58UWtrmz8NewrbF2O50ObRdZe4X1tL7+cWHcxTT9dHLSmW4gPAZn86F+l03rT0mNTLTwU0FfjUa4/9NIJPh6OIp+uQrDYIfwuAy+HTp4l8je43kzY9L7PShsajyjAxv5Fq6nlWi0XHld3GFx+/VF0nSww3H779Pv151wfgnyoTwEQfV1cHPvxt24wXH73IP/5xriIpKczOgvFDLS4OQuhUpTJQtON/KuQlP77rh6ujXS70aaVAensgWXMkJPsNbSiL09hEv03w8+XYvk5Lo/LYjpl67btdMxwj+HQduO00Qs899FhR5nCRL+ES+VoBiI86BVCVwSv9SOVoUqcHvyiK3oY6t782D37ahV33hd+0kzIv/Obv65/EqObkJMD27QBnngnw4IMmk86hQ2WWeLRZXzcVKThyKv89hscRh7Kj/nfaT8U97/oIfUn4U5sPF/i4HYiUJYdbbyTRTgW9JIhtUXUJvgwq8rEyJPnjed577EdhT/d3kmxuXIvL4z59KvKxX4X7ZkICFHVSZ+Cqrre/EjyhhVIdVZ+DdboiRp1MAj/LF2bzvmoYNYreT1y8542w8Ajn1JSJ4O/ebYT+zAzA/fcDzMwk8PznA/yv/6XHvQj2709gacn8n5w4ZX5fXh72iDD1RwUzz15js+dQW40k8nu9YTvOyorpR08+drnFB2B4nQj94BMlLcKOfn8enbd9KZavy+b7Hxsz5eY57G2WHPpWC6elzSJQ1AOY+VdXB/57gMG6+PJwXprOU606dpr0UB/l52HW55ESL3oM48NL4PNUktJ/qb/ppFl1QrbXZaeRlu9Tico6zjU+5GaL0ftu19h0tm0D2LkTYM8e892l++4z7pFvfctrcYoHvd6gcSYsLw+UtkWpuuwb0genaENbGn3HYdSKg/2rq8M/XhmgDXilKD4FKwFSA9nx8eHMNwCn9gMMRDAV41TkS/tB8thLXnhaiaAinwprXGeaF18izZNPl2Wz6iiGGDN22NpWVf28jGnf+LQ3U5pFm97I2NwMIfOVQeg1XKpFpw0HukxCb25V3px9TySaTQSz6WzbZqL43a6Z5v77Sy3qSIEe8IkJGChv3/ySFqQMOwDDQhwj+dzGgx573sA2TeTj8nmRXRlvJBG/ujps08EKAG0IS7eTLkfaDzwqjl1qkwEY/qYYLQNOy6PzuA20AoPLlyo7tka3Utl42RWlDmy2G9dzTjVCO8kqkPPANYtLw/gEWW3kmdcX13URcs1U6sHXizlfbuCqIzDSWwc6jGYV6XZNo8/1dYCzzjLj/t//MwleDh6srMitp9czFai5OQA4OG92MA8lA1hD5Nw
Lbmtsy/331G7DhTttWEstOlT484w6WJmwgUKepqdE4cw/GMXdSbQRKxXFKMhdqTUl6O7lIp/6/6llDctEv3JL12Hz7vOPEfNyaRTfTSwR6lHEJerUk18/ki2K73OpP8vxqOOjoVUI71DqLkM0H7oaRWKO4PuCggYb2qJV54wzTEaduTkU+Ans2QNw4IDevLPw7W8nsLBg/ne7p96OYEtWRApLC9jsMXS8bR76VVwcjtF5+qEsnI5+FZc3tLWtS2o0i1YcgIGwp2Kb/0cRnJahh67PNcwm8uk2SF1pv9HIPTba5cun9TRaJlcUX4kHH1Gj9pQBMT7b2k6WdhBp2fx8iMkq1nZU4AdQxomZJ+pRNvTiddXox8aMsMdI786dZtgTnmCE2X33mekOHKi0+K3i+PGBDeqMMwA6C8c2p8ekod1TCpIPloQrmXwou40kTnmUHyP0VMjTiD1G8rmtB2BY6HN4o1oaEaeeeMluQ1NY4n8e9aa2Hmm/2KCVDr4PaUNZniUHx3NbDhXqOJ6/hfBFEvujVgGIRTz4lKFt4t43IsynlfrT5sV+m9jMM21sFHU+u/Z5mUk9QsaFkHYOxXIvoNgqVWW1bR2hW7/iQ5aTiUbxscHtzIwRojt2mMw6dA2XXhrXRRcz+/cncPvtCSTJQOD3o/eomG2cEvhCYh0nXPTbPpTFh1FvPW/3yysJttz7POpNbUC0skHXgcuShkll5WLfZ9+kWXh4BYhvN98H/E2Ia9kUnzcSitJksogyn3lwmqwJKKqmA0mhZcm6LF8fex3EWikLocxt0Ah+ATT9VWvohUpr/TTSiI1q5+ZM94ILTGPbo0dNFP873zHD/+3fCij0iHDoEMDJk0bco/WpM3/URO9R/XKIcuxAAmNjHTGiTCeX/kvTcLEq5bXnopt2qbDlbw0QnpkGYDhSzz9yRQU7tglJQ1qHL7a0lXQfYTQey8Oj91IFB+fFbeQ2Hc2m0xxisuhUFaEOeRud5W21K7KZFvXk08YgTtMo8kOWEqHugSLSoud5s5OW1bBJ4HaVfQ/QCL4SRFpe/PHxYS/+9u0mgr9nT7XlbAvLywOhOD3NovcA7pAuU7p5I9VFw4W4ZBeyRcdpRcK2TKnuQ+080rZ2OoOfDT7Op0KRBdv+CZnHdz5FiQkfgZlmA+EClkfwq7CR5AHLHENZslJlBh2pO+poBD+QptT+Q6Db5OO7p7XPsTEzfGNjkDFkdtYI/d27jSi98EIz7K67AB5+uOKNazjHj5vGylhh2jqXABxaMuZ2GtqVVGffhz++KWLN/edpIpBH/6n/needl4pjg0fw+X9cNs1Yw0U6jsP18w9ZuaLd0jIR323g0FSdPDrPLUg4nM7jU8nSL9vGTWwefGldPs+ysrzZIc/RPO3UQt4cpE1b5fGKWWP4RJ19bT1Fv8nh/VI5Y963RaMCPydtOVlcN1zb6zYKFRroxUerzo4dJhK9b5/5ANbiolmqQWvaErffnsCJE2afTk8DbNli2jXA8vKgNbMNpirHJsaHUk0i9MupFN5Ilady5PPQtzfj48N533Ed2LV9fCpLlJmn//e1rfBKjpQ2E8DvI1W+aBRdqYqQjChFTJOFJj83m27JzULo8fLRE/ifLl8KKrYhqFrXNqjAr5iyfXVFYLuB+WbV6XaH8+M/9rHGtnPhhUZU3XFH+dvQdObnjeBE7/3WrQBTsGLy3mPOSYCBcuZpYQD6Qr8DCXQ6nf5k1N+N8OwtNAc9jqero1lu1teHh1MfPNpoMOI/OWmvm9Qhgm3ing7LIvR9tyUt+q7R+WZQt1fYliVGejPbNHxEoK+nW6kWLtJtFprQ/rxlKvv8SKvgSJRxjarAP0XVN8A23nS4XQB949u3m+GPf7wZ9sAD1KqTgEbxB9xxh8l3v7Ji9t30tHkTsnUrDHvvaQiatzoF2OQHGR8fH4rKo9jHDzQBDIt02ngUp5uYGLw8wI8xUTsM7dLlU9bXB/PyrDm0UsAtRNL
bBPxP30TQug7v0q/g4ny0zHQXUoGOlZgQJBsUHy8Nl5ajxI/L6lEnNqERmrayTqGcx1YTQxaakOU3tRJmg1c06XAAd6CwbMpcX5ZzpYzyqMBXNl1s0jBXq28+LYob/DjRtm2m//GPNxH9hx4yw4dFvlnSKHPHHQkcOTLQ8Cju+1+unScCH8CtIAGGlPPExEDg8yg8jdxTMc2FNUbnAYZztk9MDIbjMZ+w3FnGxgaNZTGnPQ7H4treLvCsOTYhT3cHXZY0nlZI+Dp9o/B8Xt8v5NLpbOVQmg+PNJfxIC8iAhoaZc1CTJWfPFDhWuTxbMr+SavQujz4PhHssgV/nRWpKiszKvBrIsZXh2knniTyeSSIChQUezMzZvjevUbUzc8bT/n/+T8AR46UsimNZH7eWOwxW87cnNl33S7A+Nipc4WqbBr65iFuVKenlPhYd7Jv+UHbzNraIDqN0XmsBHCbDcBw5B673a6x8nS7slefe/Dpm4HV1cF6aKNTGvXnUfaJCTNscnK4coG/Tsfe5RWWEFHNo/h8Xl4Jsb2BkN5E2NbvWzlQ4iat0V8WXPYUOj6LRUeKusb4vIqFNkbeQ5AsaVJlMY9FpyqKOs+zWHTKQAV+IKNykyviRMcoPubHP+MM0z3nHCMmH30U4BvfMM4TXOsw7b5pPvxwAocOARw7Zr4VgPb6sTGzz/DXF3o0ZA4wrI5xPO3ShraWiDy35AAMf5xJ8tbzOgbOQ/31Um77lZXhdVChTysfCC0TtdVQyw3vTk4OpqEinpKWKjONNHEtVXBwvbZKQNayKHHg48GPSQSGitJRee4p5RHT+U+xvYkJta3ltbmpB78iRvlmZmu9nta4VjoxUcxMTw+Lw/PPN9Fp9Od/61sABw6UvGERcv/9AAcPGlHf6w2+Aow/jDybxq4dmMRw/tiYmQHD7lzkUxWJDW031mFiYnxISNM0jWTSoeFoqaHYbDA4/draoGj0h5YcrECsr8PQB5+kCD6FCnafSD4dRyslPpF7XkGRPPjSfuAVJ6yMuP5LP1pGfkht60/bJqU8fDz4dfmMXR582zxp0/jS5mdp3m1rS0Ngl0WHvw1yvXWKjdD2HVltS4h68COjyRdlEaQ9sKgAwdSZW7YMGt0uLRmLzsKCTeC305t/zz0JHDhgLDkLC2YYetJ5Q9AkMeJyZQVgcrY7UNCo+FA9SwZ2pvZQOPLGtjzSzBvVSvYWycZDRToH5+NWIAB3BJ9vCrXX4PbgvCjkpSg/jaKXgSS++X6zHRqerpRbnSi2txJKvMTwnPAVD0WLjDoyCtVJlkrcqOwbxaAe/Apw+RZd/a7lhR60WGvuLo99WnRH2g8o3GZmBlHU6enBF2/37jW58n/wA/MxrLbzne8YcY+iuNsFmJoy+wbF9drawLKDmWxWVjrQ7U7C9MxppjLQWxlk1qGpMwE2K8qNDeiMjcHERKfvuUffPI3U83oCdmmdAoU1DsNKAGbk2dgYLhLtp1H71VWzbLpeAHfGGh7ZRlHP/9MIPhX9dJtspH0Nl8IrPpOTg3LQKD19i4DD8T+P5vuUMQSN6ldLGVFKyXPv83wK9eC7spvE+KxqA7HqgBBsdpQyG2yXTZ7jomkylZGBipbxcSNm19dN6sedO43wO+88M+7gQfMbptnefEx9ubwMcOiQeWuxuDhsLUGRu74+EMy93kBs8tSS4+MAW7ZMwfjExLDIlxQ6U3g0ik/fFmB0XfKM86w4+MaBdmm7XqTTgaHsPbgOLBoui0b/bRl4cHm4DVy8S0K/bHFP4dlzXPYbKZovlZVvh2InNtEQW3l8aWq5FUUZRgU+pEfmRxFbmsw0bA8HKrzGx09lhjnV3bbNRPfPP9987fbhhwH+6Z/wi7fN52tfM9vS6w0aFGNFZ2bG9KOAQ6E7Pg5w8qQZjpF2jPKjP9949ceh291iGuTOnMq2g0KfckpFctsUgIk6oy+eim7sUpFOjyOKe/TX00j96urgTQQKeB7Bd3n
v01JU0kqIJIhdoh7LTrG9PcAycXFN6000Ww+ttKX9xsY2txOgFRWf7Xc4sqzD2kos2UxCPe8+SPPboqKuSKrvMqT+PNvQ9uco91+73oDwZ2kb9o1te2we/NiQypXnuFTdmNbGSAn8WB4Ao4BtX6OYWV83fnwUVGecYcYtLJjxT3iCaYR66JC0dHrxxHk8b7ghgV7P2GwOHDBdFLedjhHn+IVXLu5RMGMayV7PTEP7u13jy6eNc43o78DY2GQ/cxG9saLI5tg+eIXinXryAQbDuMikOe1RRKN1h6bHpMKeZt2xiW4JVwQct8k2nY+45w198bjZou/8jYIrgi9NJ20ft+3Q7VKGifm+XlbZirDolPlMbINwVYqhDTakJuIt8NP86bHjymzgM13oOrI0aoqlVX2WzAs+06LAGxsz0Wgawe92jcifnDRR/G3bTOrI//5vgPvuM78m8aUvDTcaRUE3O2u2cetWY9lBr/2WLWYfUG88je4CmH7cZ/jDnPnYpmF2dhDdN/N2Ntk7UHBTMALNbTZUZNIUmTRij55+7Ed/PY/gc2HPG+gikrDP+tEpF1zYS28QbMujbzN4VH5qahCdxzSxriw/GLmXvPiI9EYiLXo/qtRx/7RlFysK3+eX7zg+na386sHPh+0NSN3P+DLxfVsUO2UdI9/9kKdNKOIt8JvYSMJG7BdWTOUr+oFFhSIVr/gxrB07jEBaWjKpNJeXzbiDB22WHdu+qub8fNKTEpieNpF0zIgDYL7WKwm7JBl8FwDzxuPHolAQopDnXvLx8UEjTiryJycHFYSVlWEbD68sUKjAlVJhcg8+TX/JKy+0S9Nl0gi+9MP1A9hz6NuGZZ3WR9hz+IepbNF6qTEtTkePp03M0/683nsV/dVhs2i0CW73iSUgFTOjvL+afh0UeYyKuD9k0eCNtuik7ShpfKwXVkzlkt5qlC3yaQPPpSUjTI8fN5H8884zmXYOHTK2nTvv9I/olo0r68/0tCn/9PTgi7SYvh67y8tG7GEEHkU892UDDIZjKnwU91NTA0GP65mZwYa4A6GP+xlAFn88ks6/cku/ZWBy8w8fS/TkY677bndzBB9AjuBLXf6f9tsqBbZ56XifeXEf0XYJvCEvrXDxH03daZsOl4MVMFyny5qj0ftmwO+hRXjwQ58Rtu+YuN40Z/Xkp21fTM+3qgjJdtQm0toXNFn0F4XrjVyR36FotMCXxLvPsBgZlVo9QkUJtX7MzRlhuL5u/gMMuqefPvhA1ve/L2XbodS/L5eXzXZhNhyedQYFMAr9qamB2ENRCLDZvsGFIrfsoNCnXRxOxSVGh7mvmwpgW0WKZuIBGCyPZuHB4TxqTyP9dBk4TOrif7o+Ktaxn5P2psC3Ma+UpYdWxNA6xStgOB0V9LTLff28m2bNSSu3Ug9FPXOKsozyxo58eNblx2gxjYUm6I4q0P2QjSK0q5fAz3Kx1nVQpRtZU242trcNdd0wXest4uSjogzF09zcQASvrJhhJ04Ygbp7t7Hw7N5tMu48/LCJ6t95Z65ilMrS0kDgd7vDXRrJ73ZN1hzJo40iD6PvNLKPlQHMsDM1ZYZRgU8j+VTo07cFku8bsUW4qcjnlResnFARzz33dHq6fMwiZIuw83OGCn1aZluWHirsbQKf7gup0SytKKGQ5+Kdf4SL5r/nVh26Tiru05COmYr7ammKgPG5l/t48OlzwdWwdxRxnQujEsVvyvXQBCQ9G6IHvQR+VVkAQufNMi5msjSkKhPXjb+oc0KK5AMMhGiSGJvL1q3G4z4zY76Eu3s3wJ49AMeOmVz6R46YqD71wccCTRNJG69SuwxG8ql4RPsOTufK906j+VTw8wg+RvqpbYcKUS72fUW/j72GV+j48nA8VhzQvy+tl+5XKfUmbehrK6OU55574/nHq7jdiQp4W+ResulIb2Vwv1DfvS1rj7QPlXpoig1BEgZFWnK0Qa6bUdkntu2M8ZqIAZ9KMm//4rsva7P
o8AhAyMFvwoXSFGtQCEVvEy6P+7hRJKGo2tgY+Mw7HSNat20zQn9xEeC004zQ37fPdO+910T2l5YKK2ouqOhEgY/edCqm6T7ABqougS/1o80HBf7y8kB8ol+fWnq4+KS+cJ6T3SUoQ0S0ZKexWWzS1sf9/DhuY2MwHPe5Lcc9gD1aLlmjaPSde+ttkXu6f2nknu9fLu75PkpDRX8cZMmoVjQ2D36Z6+PrVsFvGPXtByi+XUpZ1G0xs907suyvUiw6aa3G8+bjrfsA+BIqiGNuEFzGxUjPk7GxzctHsYJfvUWRj/nljx83tpYdO0x3715j6znvPGPrefRRUwHAr8ceOVL4JliRGl7SKC9vaGkTZjy3PAp/hFs8MOp/8uRmPz+N7GO2HhqRphl3aPlcYt9H9PP+tGw23MpD/0vCnlaeaEQf+1Hg0/JI5abbiCKdRuN5ykv+JoS+PcFKKla2qFffN3Jv28+2fa7ivj5iFCsum2rWtJ6hWWFieoaVge9+HBV7jo0Yrw8bMR2fvGUpJYIvNeRx9StumlKhKQIezadRURREvd4g//vSkhm+tDSIWM/MmO6ePUbozs8bcX/okPm/sGCE//JyvnJKw6h4l0Q8t3FwWwxfLo3i4n6hX1fFjDU47djY4GNZ9Au0+JEs7FKRjxUAmmcfRa1kCcJtlRropll6AOQPSkniX/rPI/ZUvEt2HfxCL102Pcc4VODTY8ffcNiy5fD5+L5zRe6RLI1qlWGqvGeWaWGtglAPvjT/qNMkAauMDqV68EPTQ/lGE5p0QwmxIMV4A63CasSXT4UNCrG1tUGkdHLSCPzpaTN8585BHnraPXbMRPRRzC8tmd+JEyb6v7xs/q+vm+EbG4MGsfwHINtKaFcaRhvGYtdlsXF9gZVjyxqDXRT7uAy6PikdJwp7FKo8Hz//OBMXqZK1xDfCn0X0Y5dn/ZHEPq6D7l/uc5cazuIHw3jbBtsbEFoxopU4arei+43ar2hZpP2nkXt/qrxnNsFzbCtLVg++a5mIWnTqQ7Jn1Xk80tYX27US+/kajQfflp5LUXzg0VYqZrhdAmDQWHVlZZB3fmbGdNHag5WA5WUj/tfWBkIfBT79Eqstb7stAkyR0iBKlhebuLchedyTZLC/pOlR8GNUO0nMejG15erqsLDHtwA0mw8V+Hw70uw7vN9m4aH/XcN8PpCF68XGuxS6v1GE83YIPGMRffshWbBQ4NN9ws8Bvi98xL0Shj5zlLKI/dzyEX+8AhDL9sTQdjGWfeHCdz+V4sHPM38MB7guYk2TWfcxkYSjFKnFiP727WbYyorpX142wp3+X142/SdPmmmXl43AxXlQ6FOBj/9RWPL0jDzjC4U34sxjbaGRahzObSlSSkhaVtyO8fHhj1YB+L11kKbB7bRZTLIKVtebCht0XTzXPm1nwCPrPBqPEXwayeeZdLjA529C+BsPn4qdRu7jp0nWnDZ68G1R6irWE7NGcVmi60xrKu1HaZq6aIKwp/iUt7YsOi6yZNZpM0078crAJ1qOwyYmBhFbjOijaMePS6Hg7/U2C3tMZ4k/2oDTJ/0iImWQoXCRb5uPi3VJ6HORT7P14DJRhKeJY7p+tEfRKHSSDCoHvPKAKS5xXoBw604ebHYp3sDWp40ERvBpJhybsOcZdiS/PY/q033i+9ZDiYe6Ax9FEFp+lzCrMyDVhmPRdlyVCz125eAt8F3J9qUaPe/PcsG36aL1OZFjEvIxRypoBBRgIGpR2KOYm5ranE2FWm/oMLStuMQ9Ffn8x6PmtGx8WFq/ZEGh20XLQd8uAGy2FuEyaKVE8qfbRD9/gyB51/GYAMiWJByO2KxIkph1vfmwvSHgUXHuwacCn7dFkDLk0Mi9S9hLjWuLjtqnjRt1qhYMrnt21jKUIZDLzIIm9fv483G6rNuqyTsG+ETHXVRdOQs9dnm3z4c2thfxFvghr330wsuGeuGy4xI9+MEkFGNSakXapT51KqbpfwA5Sg4
g52WnXdt/l7dcEvhUpPPKh217bHYeunyERvJtliJXlJ6Cfn9fpAqBtF6pYkHH8YogtSFR0Y4VF5rikjeclT5cRQV9UZYcFyru46Wo+2VZFp0qRJK0TvyP602zjMRITM9miSzRcekZH8s2Vn2uIrFsf1FEadFpM1ny5cZ+c6kbLpRs0XGXb52Ldkn8SlF7gM0Cmf+nXS7epWlsy+IVCqnywcfRru1NhPRGQCqXD5JotYl1aZ4064pN0EsWGC74KTxSTyP43KrDffV5hb20PdK+8BmubKaqjzmVQZn3eZsIzJMlDyDMgx+yfUXsa1smP18HQshbiVhI2+d02+vUFWltKNJ0j69jZJT1U2MEftMiyi58tqWJUY5YkDzXfBx20baBopbbfPh/WwNWSdTT/66POaV1cV7auJf+qP3GFcmXIvp0nK3S46qU2PY/Jy0iT/9zwU/FPo/O2wQ9FeA4jr6RSIvG8w+S2XLcY4NdgM0Njl1ee9d/n/2p2CnzWWFrrNoEJKGTp/xp4tcWwef9aWIsTXinpX9MG9a256tPhQX3c1F26iz4ukBCrmebtbgp1uiiiV7gt3Xnx/h6rEkPqzS4KKL2DOxubGxueMpFLUA2UY64Ujn6Lkfyy/OfLXIvfdXVZj1aXR0uM/+IlGvfhGDz1dvEPhf2vtF7KsZxXtwGHE4j+DQNKI/O+wh79dpXT92N9Ir24JclsOpoU+WqAPBxeYNeRe+zup/FofhU3nzbTMQY8S6zQhabDiuS6AX+KFH3idbk6FQaLsGP46VKAJ0esQlcSbyH/ndF+tPeKkiCnzeutbUpcM3jekshRfdDCPHa+wp9mv6SC3y6fBqht32AjFcSeANaHEaXn9eOkzZu1InpbW6R5ajSnlPH/uMRY5fApOQpq09k2mYPaRJ5M9TUXVHOQ5vfxuRFBX4DqLNG3eQL34WP4MfhfHqX+EdsqS759CH/qV0nLaKf1n5AyggkefslgZ9mN0rbN4hL8HL/Prfp+Fp0cF6a2QePNRX4fH7XtwBslY4iUl+qsN9MWwV9DBS5b21WGdvzq4rjGuKhb8uxdbWHsFmbYiPGtwhNpFECP+YTMgu+J3BVJ7rNqzcKhAgrOi2vGCDo5ecUURnw8etLqTVxGp8KgJRdR4rgZ/Xp2+D+ewC7Bx9gs9/dJ/c8wgW9LSqfRdTz/1K/7zglnLLuma4Gi9gfK2VHqkOi8FW1kchCzMfQB1v5fS06MaDivhgaJfCVaojtYq8TSXhR4RoqzPibAb4MLor5R6f4dL6in7Y1kH4Ads+95MFPa5dg2x4fbF582pUa3/qIfo7NmmOzCNHlAqgVJybqEAUuL3is99FYy5UVFYMa8XaRljGpzfutcQK/rQfDlau2KlzRqbY9FPJQhCiz2Xz4cOlNgKsSkCdLj2T7kf7TdboaD9v6fUlrmCo1YvWJtlOkiD9fHp83b7TeZ7wSThUPa1uGkrxUcZ9vQuUjC23alqxIWWeKOEfbIICrbJwdG40T+G3FZj9qwwU2qqRF/32n9RGLdLmY+pPOm2b1kQS8b2XBNgyXY0NqmGsT4YgUJbeJe1ye7cu5dJ40YW9L7enT7ztOyYfeJ/2hIlAFsqK0ExX4EVH3A8pWwVCKo6zoPx2WN+pP//s0/LUtly5bIqQBLpIln77NOsOnzyLmXcv1Ha/kp47c3T7rjekt6Cjc3/OcB23ZF6PWfk6xowJf2YRGdeLAZuNxTecb9efQqH+SuCP/Ulmwn39XQJomC2nCOvQ/RYV9c6g7CJJGTIJeIuayFYHrg1eKMmqowI+IkHy1ZeDyqrX1gdAk8gjGtMw/vll/bJYbKVLvK+izRPJd420f0qLz2Bo5p0X5s5RRCSPmAIPvV0KleWzE8OXQWPd3Vtq2PSEUcWxtH75Sy7CdGPeTCvyIiOGEUJqNT9Tft4GvVBHg4phDKwBcWAPIFYFQgSz59F3r9Im+a4Re8cH1QSE+LHaROaoe/LY2NpZo+/bFRIw
f3FKBr/TRm0H7KCrqD+Bn/8krerNajELKIFVSfNangr4amnQfalKGjrZ78LPs77Sv2xZNFedE0WWPUbgWRSyR9rJQga/0saXJbNNDQNlMFq8/pwhhHrK+vMvxXbaK+jiJ9b4kffgqlhzc0rpi3IdZ4dsSo3Cr4pgX/YYiRutJUeRtlB37vlCBHzF158bH9bT9waAM4ytq83zwK++6i1iHj31JqZ5YxbuEJOB5YCTmyHmb7+1Zn59t2X6AYq8lW9uS2EVuFbja5mC/reJfJirwIyaGC6dNNzulWELEcJ4MOqFodL7ZNOmeE2LRoQ/1mCoxsZSjSPI8O8vy6MfwPM8DP2ebvj1FkrZv6rLvqcBXNtHGG75SLyqmlaJpmsCIwaPf5nt7GfuzqLSibTnWTbvm0igykh7jvlGBr2zC50Rt84NCUZS4iCninZUYPLsh6+cRydj3v1S+osVb1n1Qx7Ev8k2E1L6h7jYlRVBUeUMtOlWhcbWICc2nXCUdSPo/RRlV6HWg18Lo4bpH41nhO29s2NpgNek8j30fV0XRx8y2X/n57zPPKFLVvtAIfsTEcGNtUrqwGG8gdR2ztn+xMg9lRiObEOlUisN1j47FhxuC7cNdTckdX2WGGoD6P07pS1ltCWL0ndeBevCVkaXqrD9IHQ+iGG5iLpEhPcBHkSK2P4ZjrSh50Ewo5RDb/TW28rSRGK8hFfgNom4fX9OoKordpGi5iv8BZW1nk84HZbQJfQNhy+sfy7ledS58WzQ7lv1RBmnWmyypIZvo36domkwlN02+AOrEtd+yPKRsAq7px4c3nGrzQ0pRlHZTRzDMJvQUN01/dsZKqxvZ6sVVPLzhWNP3MW0cyRtM2n7SvG1BitpJ2x0bWctU5rbEuJ+aStPvMz64GujSftc4vpy0/jzTpg2vg7obvDfhXlk0tv0dowc/5HxPm5eSpg3q2hetFvgxnFBtYxRvYEp88IoZ7+p52S5G4Xj6NsyzVcJDl+Mzrc+4UTg2ih1btqiYKn5InsavTTzP1aJTMWVYIJp44inNIta3NmnixcdGxadLuzZDrrfYrU6xl29UySNEqiwHHR7DedQ222QT4c8Kl3Wpbo96UeTZvjL3hQr8itHot9JEYvXn+9w4fZbh6k9bZ5OJ5Tim0ZRylkXTBVCV6H5xU8f9Wwqi8PGu/tjJs31l7gsvi05WL2DdxFQWRWkavr7KOm7G/D4kdW3zFbV+pRpsr//bdgzSnqWu8z2rJz/Ukxy7D79pwrCNSPZJ/r8uimzbUlelJORa84rgh3gBYyKG8uS98cWwDcpoIkXsQ6PdVUDvR2mRzTSLjk/0X6/JdlNXdDzEg0/LGBoBzBM9jN2io6RT9nGSLDr8f13UFUnPQ6UWnboPUOwUffFkWZ4eIyUrMdyE85A1YpR2M2/SfslTzthFmu045Dk+afO6REpZ50WsYsOnDHWKffXgx4ktMNRkD35Rvvq0afPgJfBthQltWNBmyr6hjcp+VJRQQq89Sbzz8TZbiDR/bMQu0vNS5/aVKbTLEghlPYdt10NMx0eJg1g9+Hmuq6LeBoRse+i15R3BD/Hx0WGxvaIpkjIe+D42A0Upg7YLQyRtO33GV319SgEVzqgcv1EFzzuf4+w6R/NUDvTZ1HzqesMSg8XFpVV9+uu692fFW+CH3ABshWrijSHt7QUdXtT6ilhmE/d10wmNBvD+oiJ0vuu1RVRi9NkXQZOvCdv9JqslyXddsePj90279uj0PDBVpbUg7VjisLzR0KyRRp8y1o1adGTK0iuh667jeMR+znKKvKcU/qGrJuzAvDThAZhAp/9TspNWm7eNz7PffSN0Wectcr6Y6QxdBe2/LxVF0/dVyDXYlPO+LguD0myacn6PMmXpCACATpIkzjvF1NQU7Nq1K9dKFEVRFEVRFEUphkcffRRWVlas41MFvqIoiqIoiqIozaFwi46iKIqiKIqiKPWhAl9RFEVRFEVRWoQKfEVRFEVRFEV
pESrwFUVRFEVRFKVFqMBXFEVRFEVRlBahAl9RFEVRFEVRWsT/B+qM/R35P+PwAAAAAElFTkSuQmCC", "text/plain": [ "
" ] }, - "metadata": { - "needs_background": "light" - }, + "metadata": {}, "output_type": "display_data" }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Vorticity plot at timestep: 1400\n", - "\n" - ] - }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] }, - "metadata": { - "needs_background": "light" - }, + "metadata": {}, "output_type": "display_data" }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Vorticity plot at timestep: 2100\n", - "\n" - ] + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] }, - "metadata": { - "needs_background": "light" - }, + "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ - "Vorticity plot at timestep: 2800\n", - "\n" + "19:50:37 C02YR4ANLVCJ SmartSim[54122] INFO fv_simulation(54161): Completed\n" ] } ], @@ -376,75 +326,209 @@ "cylinder = client.get_tensor(\"cylinder\").astype(bool)\n", "\n", "# plot every 700th timestep\n", - "for i in range(0, time_steps + 1, 700): \n", + "for i in range(0, time_steps, 700):\n", " client.poll_dataset(f\"data_{i}\", 300, 1000)\n", " dataset = client.get_dataset(f\"data_{i}\")\n", " ux, uy = dataset.get_tensor(\"ux\"), dataset.get_tensor(\"uy\")\n", "\n", - " plot_lattice_vorticity(i, ux, uy, cylinder)" + " plot_lattice_vorticity(i, ux, uy, cylinder)\n", + "\n", + "# Use the Experiment API to wait until the model is finished\n", + "while not exp.finished(model):\n", + " time.sleep(5)" ] }, { - "cell_type": "code", - "execution_count": 7, - "id": "7bbce88c-6f63-407a-8912-5787139f015b", + "cell_type": "markdown", + "id": "d9999bd1", "metadata": {}, - "outputs": [], "source": [ - "# Optionally clear the database\n", - "client.flush_db(db.get_address())" + "## Post-processing with TorchScript\n", + "\n", + "We can upload [TorchScript functions](https://pytorch.org/docs/1.11/jit.html) to the DB. Tensors which are stored on the DB can be passed as arguments to uploaded functions and the results will be stored on the DB. This makes it possible to perform pre- and post-processing operations on tensors localli, *in the DB*, reducing the number of data transfers.\n", + "\n", + "### Uploading a script\n", + "We can load a file containing TorchScript-compatible functionsto the DB. For example, the file `./probe.script` contains the function `probe_points` which interpolates the values of `ux` and `uy` at some user-provided probe points. 
This is useful when we are interested in the value of a given field only at specific locations.\n", + "\n", + "The script looks like this:\n", + "\n", + "```python\n", + "def multi_unsqueeze(tensor, axes: List[int]):\n", + " for axis in axes:\n", + " tensor = torch.unsqueeze(tensor, axis)\n", + "\n", + " return tensor\n", + "\n", + "def probe_points(ux, uy, probe_x, probe_y, cylinder):\n", + " ux[cylinder>0] = 0.0\n", + " uy[cylinder>0] = 0.0\n", + " ux = multi_unsqueeze(ux, [0, 0])\n", + " uy = multi_unsqueeze(uy, [0, 0])\n", + " probe_xy = multi_unsqueeze(torch.stack((probe_x/200 - 1, probe_y/50 - 1), 2), [0])\n", + " u_probex = torch.grid_sampler(ux.double(), probe_xy.double(), 0, 0, False).squeeze()\n", + " u_probey = torch.grid_sampler(uy.double(), probe_xy.double(), 0, 0, False).squeeze()\n", + "\n", + " return torch.stack((u_probex, u_probey), 2)\n", + "```\n", + "\n", + "Note that we don't have to import `torch`, as the TorchScript interpreter will recognize it as a builtin module.\n", + "\n", + "We then proceed to upload the script to the DB under the key `probe` and add the probe points as tensors to the DB." 
] }, { "cell_type": "code", - "execution_count": 8, - "id": "7d9f2669-4efb-4f38-97e9-869a070ab79c", + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "22:50:59 e3fbeabfdb3e SmartSim[1216] INFO fv_simulation(1337): Completed\n", - "22:51:01 e3fbeabfdb3e SmartSim[1216] INFO Stopping model orchestrator_0 with job name orchestrator_0-CI23BKDK76I0\n", - "22:51:11 e3fbeabfdb3e SmartSim[1216] WARNING Unable to kill emitted process 1327\n" + "Default@19-50-39:ERROR: Redis IO error when executing command: Failed to get reply: Resource temporarily unavailable\n", + "Default@19-50-40:ERROR: Redis IO error when executing command: Failed to get reply: Resource temporarily unavailable\n", + "Default@19-50-41:ERROR: Redis IO error when executing command: Failed to get reply: Resource temporarily unavailable\n", + "Default@19-50-43:ERROR: Redis IO error when executing command: Failed to get reply: Resource temporarily unavailable\n" ] } ], "source": [ - "# Use the Experiment API to wait until the model\n", - "# is finished and then terminate the database and\n", - "# release it's resources\n", - "while not exp.finished(model):\n", - " time.sleep(5)\n", - " \n", - "exp.stop(db)\n" + "client.set_script_from_file(\"probe\", \"./probe.script\", device=\"CPU\")\n", + "\n", + "probe_x, probe_y = np.meshgrid(range(20, 400, 20), range(20, 100, 20))\n", + "client.put_tensor(\"probe_x\", probe_x)\n", + "client.put_tensor(\"probe_y\", probe_y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We then apply the function `probe_points` to the `ux` and `uy` tensors computed in the last time step of the previous simulation. Note that all tensors are already on the DB, thus we can reference them by name. Finally, we download and plot the output (a 2D velocity field), which is stored as `probe_u` on the DB." 
] }, { "cell_type": "code", - "execution_count": 9, - "id": "2bca8a25-6e1b-4540-9d1e-932eb52d7b1e", + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { + "image/png": "", "text/plain": [ - "['Completed']" + "
" ] }, - "execution_count": 9, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ - "exp.get_status(model)" + "ux_name = f\"{{data_{time_steps-1}}}.ux\"\n", + "uy_name = f\"{{data_{time_steps-1}}}.uy\"\n", + "\n", + "client.run_script(\"probe\", \"probe_points\", inputs=[ux_name, uy_name , \"probe_x\", \"probe_y\", \"cylinder\"], outputs=[\"probe_u\"])\n", + "\n", + "probe_u = client.get_tensor(\"probe_u\")\n", + "plot_lattice_probes(time_steps-1, probe_x, probe_y, probe_u)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Uploading a function inline\n", + "In some cases, it makes sense to define the TorchScript function directly in a Python script or in a Jupyter notebook, like in this case. Let us define a simple function which computes the norm of the velocity field." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f43cc359", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "def compute_norm(ux: torch.Tensor, uy: torch.Tensor):\n", + " return torch.sqrt(ux*ux + uy*uy)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We then store the function on the DB under the key `norm_function`." ] }, { "cell_type": "code", "execution_count": 10, + "id": "48ba5e40", + "metadata": {}, + "outputs": [], + "source": [ + "client.set_function(\"norm_function\", compute_norm)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the key we used identifies a functional unit containing the function itself: this is similar to the key used to store the `probe` script above. When we want to run the function, we just call it with `run_script`, by indicating the `script` key as `\"norm_function\"` and the name of the function itself as `\"compute_norm\"`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "9b556de6", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "dataset = client.get_dataset(f\"data_{time_steps-1}\")\n", + "client.run_script(\"norm_function\", \"compute_norm\", [f\"{{data_{i}}}.uy\", f\"{{data_{i}}}.ux\"], [\"u\"])\n", + "u = client.get_tensor(\"u\")\n", + "\n", + "plot_lattice_norm(time_steps-1, u, cylinder)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "7bbce88c-6f63-407a-8912-5787139f015b", + "metadata": {}, + "outputs": [], + "source": [ + "# Optionally clear the database\n", + "client.flush_db(db.get_address())" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "7d9f2669-4efb-4f38-97e9-869a070ab79c", + "metadata": {}, + "outputs": [], + "source": [ + "# terminate the database and\n", + "# release its resources\n", + "exp.stop(db)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, "id": "50b42065-6356-4a5a-b742-daca17b8bd6e", "metadata": {}, "outputs": [ @@ -453,34 +537,26 @@ "text/html": [ "\n", "\n", - "\n", + "\n", "\n", "\n", - "\n", - "\n", + "\n", + "\n", "\n", "
Name Entity-Type JobID RunID TimeStatus Returncode
Name Entity-Type JobID RunID Time Status Returncode
0fv_simulation Model 1337 041.7211Completed 0
1orchestrator_0DBNode 1322 083.3787Cancelled -9
0 fv_simulation Model 54161 0 38.1561Completed0
1 orchestrator_0DBNode 54134 0 66.5750Cancelled0
" ], "text/plain": [ - "'\\n\\n\\n\\n\\n\\n\\n\\n
Name Entity-Type JobID RunID TimeStatus Returncode
0fv_simulation Model 1337 041.7211Completed 0
1orchestrator_0DBNode 1322 083.3787Cancelled -9
'" + "'\\n\\n\\n\\n\\n\\n\\n\\n
Name Entity-Type JobID RunID Time Status Returncode
0 fv_simulation Model 54161 0 38.1561Completed0
1 orchestrator_0DBNode 54134 0 66.5750Cancelled0
'" ] }, - "execution_count": 10, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "exp.summary(format=\"html\")" + "exp.summary(style=\"html\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "326352c9-d10d-47b5-a3b3-032b033b55f6", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -499,7 +575,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.10" } }, "nbformat": 4, diff --git a/tutorials/online_analysis/lattice/probe.script b/tutorials/online_analysis/lattice/probe.script new file mode 100644 index 000000000..ea5de7248 --- /dev/null +++ b/tutorials/online_analysis/lattice/probe.script @@ -0,0 +1,16 @@ +def multi_unsqueeze(tensor, axes: List[int]): + for axis in axes: + tensor = torch.unsqueeze(tensor, axis) + + return tensor + +def probe_points(ux, uy, probe_x, probe_y, cylinder): + ux[cylinder>0] = 0.0 + uy[cylinder>0] = 0.0 + ux = multi_unsqueeze(ux, [0, 0]) + uy = multi_unsqueeze(uy, [0, 0]) + probe_xy = multi_unsqueeze(torch.stack((probe_x/200 - 1, probe_y/50 - 1), 2), [0]) + u_probex = torch.grid_sampler(ux.double(), probe_xy.double(), 0, 0, False).squeeze() + u_probey = torch.grid_sampler(uy.double(), probe_xy.double(), 0, 0, False).squeeze() + + return torch.stack((u_probex, u_probey), 2) diff --git a/tutorials/online_analysis/lattice/vishelpers.py b/tutorials/online_analysis/lattice/vishelpers.py index 69f1ed05a..725c690fd 100644 --- a/tutorials/online_analysis/lattice/vishelpers.py +++ b/tutorials/online_analysis/lattice/vishelpers.py @@ -3,8 +3,7 @@ def plot_lattice_vorticity(timestep, ux, uy, cylinder): - - fig = plt.figure(figsize=(12,6), dpi=80) + fig = plt.figure(figsize=(12, 6), dpi=80) plt.cla() ux[cylinder], uy[cylinder] = 0, 0 @@ -13,13 +12,62 @@ def plot_lattice_vorticity(timestep, ux, uy, cylinder): ) vorticity[cylinder] = np.nan cmap = plt.cm.get_cmap("bwr").copy() - 
cmap.set_bad(color='black') + cmap.set_bad(color="black") plt.imshow(vorticity, cmap=cmap) - plt.clim(-.1, .1) + plt.clim(-0.1, 0.1) ax = plt.gca() ax.invert_yaxis() ax.get_xaxis().set_visible(False) ax.get_yaxis().set_visible(False) - ax.set_aspect('equal') + ax.set_aspect("equal") + ax.set_title(f"Vorticity plot at timestep {timestep}\n") + plt.pause(0.001) + + +def plot_lattice_norm(timestep, u, cylinder): + fig = plt.figure(figsize=(12, 6), dpi=80) + + plt.cla() + + u[cylinder] = np.nan + cmap = plt.cm.get_cmap("jet").copy() + cmap.set_bad(color="black") + plt.contour(u, cmap=cmap) + plt.clim(-0.1, 0.1) + ax = plt.gca() + ax.invert_yaxis() + ax.get_xaxis().set_visible(False) + ax.get_yaxis().set_visible(False) + ax.set_aspect("equal") + ax.set_title(f"Velocity magnitude at timestep {timestep}\n") + plt.pause(0.001) + + +def plot_lattice_probes(timestep, probe_x, probe_y, probe_u): + fig = plt.figure(figsize=(12, 6), dpi=80) + + plt.cla() + cmap = plt.cm.get_cmap("binary").copy() + cmap.set_bad(color="black") + plt.quiver( + probe_x, + probe_y, + probe_u[:, :, 0], + probe_u[:, :, 1], + np.linalg.norm(probe_u, axis=2), + cmap=cmap, + scale=7, + pivot='mid', + angles='uv', + width=0.003 + ) + plt.clim(-0.1, 0.1) + ax = plt.gca() + ax.invert_yaxis() + ax.get_xaxis().set_visible(False) + ax.get_yaxis().set_visible(False) + ax.set_aspect("equal") + ax.set_xlim([0, 399]) + ax.set_ylim([0, 99]) + ax.set_title(f"Velocity field at timestep {timestep}\n") plt.pause(0.001) - print(f"Vorticity plot at timestep: {timestep}\n") \ No newline at end of file