Skip to content

Commit

Permalink
Remove references to limit_app_cpus and throw exception
Browse files Browse the repository at this point in the history
  • Loading branch information
ashao committed Jun 22, 2023
1 parent e2edfb4 commit ac466c9
Show file tree
Hide file tree
Showing 9 changed files with 115 additions and 140 deletions.
40 changes: 38 additions & 2 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
import json
import os
import inspect
import shutil
import pytest
import psutil
import shutil
Expand All @@ -44,6 +43,7 @@
from smartsim._core.config import CONFIG
from smartsim.error import SSConfigError
from subprocess import run
import sys


# Globals, yes, but its a testing file
Expand Down Expand Up @@ -113,7 +113,8 @@ def pytest_sessionfinish(session, exitstatus):
returning the exit status to the system.
"""
if exitstatus == 0:
shutil.rmtree(test_dir)
# shutil.rmtree(test_dir)
pass
else:
# kill all spawned processes in case of error
kill_all_test_spawned_processes()
Expand Down Expand Up @@ -591,3 +592,38 @@ class MLUtils:
def get_test_device():
global test_device
return test_device

@pytest.fixture
def coloutils():
return ColoUtils

class ColoUtils:
def setup_test_colo(fileutils, db_type, exp_name, db_args, launcher="local"):
"""Setup things needed for setting up the colo pinning tests"""

exp = Experiment(exp_name, launcher=launcher)

# get test setup
test_dir = fileutils.make_test_dir()
sr_test_script = fileutils.get_test_conf_path("send_data_local_smartredis.py")

# Create an app with a colo_db which uses 1 db_cpu
colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script)
colo_model = exp.create_model(f"colocated_model", colo_settings)
colo_model.set_path(test_dir)

if db_type in ['tcp', "deprecated"]:
db_args["port"] = 6780
db_args["ifname"] = "lo"

colocate_fun = {
"tcp": colo_model.colocate_db_tcp,
"deprecated": colo_model.colocate_db,
"uds":colo_model.colocate_db_uds
}
colocate_fun[db_type](**db_args)

# assert model will launch with colocated db
assert colo_model.colocated
# Check to make sure that limit_db_cpus made it into the colo settings
return exp, colo_model
7 changes: 6 additions & 1 deletion doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ Description

A full list of changes and detailed notes can be found below:

- Replace `limit_app_cpus` with `limit_db_cpus` for co-located orchestrators
- Remove wait time associated with Experiment launch summary
- Update and rename Redis conf file
- Migrate from redis-py-cluster to redis-py
Expand All @@ -37,6 +38,10 @@ A full list of changes and detailed notes can be found below:

Detailed notes

- Replace `limit_app_cpus` with `limit_db_cpus` for co-located orchestrators.
This resolves some incorrect behavior/assumptions about how the application
would be pinned. Instead, users should directly specify the binding options in
their application using the options appropriate for their launcher
- Simplify code in `random_permutations` parameter generation strategy (PR300_)
- Remove wait time associated with Experiment launch summary (PR298_)
- Update Redis conf file to conform with Redis v7.0.5 conf file (PR293_)
Expand All @@ -53,7 +58,7 @@ former began deprecation in May 2022 and was finally removed in May 2023. (PR285
codes. These have now all been updated. (PR284_)
- Orchestrator and Colocated DB now accept a list of interfaces to bind to. The
argument name is still `interface` for backward compatibility reasons. (PR281_)
- Typehints have been added to public APIs. A makefile target to execute static
- Typehints have been added to public APIs. A makefile target to execute static
analysis with mypy is available `make check-mypy`. (PR295_)

.. _PR300: https://github.com/CrayLabs/SmartSim/pull/300
Expand Down
24 changes: 13 additions & 11 deletions doc/orchestrator.rst
Original file line number Diff line number Diff line change
Expand Up @@ -97,22 +97,24 @@ Here is an example of creating a simple model that is colocated with an
colo_model.colocate_db_uds(
db_cpus=1, # cpus given to the database on each node
debug=False # include debug information (will be slower)
pin_db_cpus=True, # Pin the affinities of the co-located database
# to specific processors
limit_db_cpus=True, # Limit the database to a specific set of cpus
ifname=network_interface # specify network interface(s) to use (i.e. "ib0" or ["ib0", "lo"])
)
exp.start(colo_model)
By default, SmartSim will attempt to make sure that the database and the application
do not fight over resources by taking over the affinity mapping process locally on
each node. This can be disabled by setting ``pin_db_cpus`` to ``False``.
By default, SmartSim will pin the database to the first _N_ CPUs according to ``db_cpus``. By
specifying the optional argument ``db_cpu_list``, an alternative pinning can be specified (
following the format used by ``taskset``). For example, ``db_cpu_list=0-2,5`` limits the database
to be run on processors 0, 1, 2 and 5. Setting ``limit_db_cpus=False`` disables pinning for the database
kernel scheduler. For optimal performance, most users will want to also modify the RunSettings for
the model to pin their application to cores not occupied by the database.

Especially for multicore machines, it can be useful to pin the CPUs used by the database.
This is enabled by default and the database will automatically be bound to the first
logical processors starting from 0 to ``db_cpus-1``. To specify a custom set of cpus,
set ``db_cpu_list`` following the same syntax as the Linux command ``taskset -c``. The
RunSettings for the simulation should similarly be bound (e.g. in Slurm using
``cpu-bind``) to bind the application to a separate set of cores.
.. note::

Pinning _only_ affects the co-located deployment because both the application and the database
are sharing the same compute node. For the clustered deployment, a shard occupies the entirerty
of the node.

Redis
=====
Expand Down
15 changes: 12 additions & 3 deletions smartsim/entity/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,9 @@ def colocate_db_uds(
:type db_cpus: int, optional
:param limit_db_cpus: whether to limit the number of cpus used by the database defaults to True
:type limit_db_cpus: bool, optional
:param db_cpu_list: lists the cpus which the database can be run on. Follows `taskset -c` syntax
e.g. '0-2,5' specifies processors 0, 1, 2, and 5
:type db_cpu_list: str, optional
:param debug: launch Model with extra debug information about the colocated db
:type debug: bool, optional
:param kwargs: additional keyword arguments to pass to the orchestrator database
Expand Down Expand Up @@ -262,6 +265,9 @@ def colocate_db_tcp(
:type db_cpus: int, optional
:param limit_db_cpus: whether to limit the number of cpus used by the database, defaults to True
:type limit_db_cpus: bool, optional
:param db_cpu_list: lists the cpus which the database can be run on. Follows `taskset -c` syntax
e.g. '0-2,5' specifies processors 0, 1, 2, and 5
:type db_cpu_list: str, optional
:param debug: launch Model with extra debug information about the colocated db
:type debug: bool, optional
:param kwargs: additional keyword arguments to pass to the orchestrator database
Expand Down Expand Up @@ -297,10 +303,14 @@ def _set_colocated_db_settings(
if hasattr(self.run_settings, "_prep_colocated_db"):
self.run_settings._prep_colocated_db(common_options["cpus"])

# TODO list which db settings can be extras
if "limit_app_cpus" in common_options:
raise SSUnsupportedError(
"Pinning of app CPUs via limit_app_cpus is no supported. Modify RunSettings " +
"instead using the correct binding option for your launcher."
)

# TODO list which db settings can be extras
cpus = common_options["cpus"]
print(cpus)
# Deal with cases where the database should be pinned to cpus
# (1) if the user set a db_cpu_list, but not limit_db_cpus
if common_options["db_cpu_list"] and not common_options["limit_db_cpus"]:
Expand All @@ -324,7 +334,6 @@ def _set_colocated_db_settings(
f"pinning to processors {cpu_list}"
)
)
print(cpu_list)
common_options["db_cpu_list"] = cpu_list

colo_db_config = {}
Expand Down
19 changes: 9 additions & 10 deletions tests/backends/test_dbmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,6 @@ def test_colocated_db_model_tf(fileutils, wlmutils):
colo_model.colocate_db(
port=wlmutils.get_test_port(),
db_cpus=1,
limit_app_cpus=False,
debug=True,
ifname="lo",
)
Expand Down Expand Up @@ -370,7 +369,7 @@ def test_colocated_db_model_pytorch(fileutils, wlmutils):
colo_model.colocate_db(
port=wlmutils.get_test_port(),
db_cpus=1,
limit_app_cpus=False,
limit_db_cpus=False,
debug=True,
ifname="lo",
)
Expand Down Expand Up @@ -413,7 +412,7 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils):
colo_model.colocate_db(
port=wlmutils.get_test_port(),
db_cpus=1,
limit_app_cpus=False,
limit_db_cpus=False,
debug=True,
ifname="lo",
)
Expand All @@ -425,7 +424,7 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils):
entity.colocate_db(
port=wlmutils.get_test_port() + i,
db_cpus=1,
limit_app_cpus=False,
limit_db_cpus=False,
debug=True,
ifname="lo",
)
Expand Down Expand Up @@ -455,7 +454,7 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils):
colo_model.colocate_db(
port=wlmutils.get_test_port() + len(colo_ensemble),
db_cpus=1,
limit_app_cpus=False,
limit_db_cpus=False,
debug=True,
ifname="lo",
)
Expand Down Expand Up @@ -509,7 +508,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils):
entity.colocate_db(
wlmutils.get_test_port() + i,
db_cpus=1,
limit_app_cpus=False,
limit_db_cpus=False,
debug=True,
ifname="lo",
)
Expand All @@ -528,7 +527,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils):
colo_model.colocate_db(
port=wlmutils.get_test_port() + len(colo_ensemble),
db_cpus=1,
limit_app_cpus=False,
limit_db_cpus=False,
debug=True,
ifname="lo",
)
Expand Down Expand Up @@ -565,7 +564,7 @@ def test_colocated_db_model_errors(fileutils, wlmutils):
colo_model.colocate_db(
port=wlmutils.get_test_port(),
db_cpus=1,
limit_app_cpus=False,
limit_db_cpus=False,
debug=True,
ifname="lo",
)
Expand All @@ -585,7 +584,7 @@ def test_colocated_db_model_errors(fileutils, wlmutils):
entity.colocate_db(
port=wlmutils.get_test_port() + i,
db_cpus=1,
limit_app_cpus=False,
limit_db_cpus=False,
debug=True,
ifname="lo",
)
Expand Down Expand Up @@ -614,7 +613,7 @@ def test_colocated_db_model_errors(fileutils, wlmutils):
entity.colocate_db(
port=wlmutils.get_test_port() + i,
db_cpus=1,
limit_app_cpus=False,
limit_db_cpus=False,
debug=True,
ifname="lo",
)
Expand Down
16 changes: 8 additions & 8 deletions tests/backends/test_dbscript.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def test_colocated_db_script(fileutils, wlmutils):
colo_model.colocate_db(
port=wlmutils.get_test_port(),
db_cpus=1,
limit_app_cpus=False,
limit_db_cpus=False,
debug=True,
ifname="lo",
)
Expand Down Expand Up @@ -204,7 +204,7 @@ def test_colocated_db_script_ensemble(fileutils, wlmutils):
entity.colocate_db(
port=wlmutils.get_test_port() + i,
db_cpus=1,
limit_app_cpus=False,
limit_db_cpus=False,
debug=True,
ifname="lo",
)
Expand All @@ -214,7 +214,7 @@ def test_colocated_db_script_ensemble(fileutils, wlmutils):
colo_model.colocate_db(
port=wlmutils.get_test_port() + len(colo_ensemble),
db_cpus=1,
limit_app_cpus=False,
limit_db_cpus=False,
debug=True,
ifname="lo",
)
Expand Down Expand Up @@ -268,7 +268,7 @@ def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils):
entity.colocate_db(
port=wlmutils.get_test_port() + i,
db_cpus=1,
limit_app_cpus=False,
limit_db_cpus=False,
debug=True,
ifname="lo",
)
Expand All @@ -278,7 +278,7 @@ def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils):
colo_model.colocate_db(
port=wlmutils.get_test_port() + len(colo_ensemble),
db_cpus=1,
limit_app_cpus=False,
limit_db_cpus=False,
debug=True,
ifname="lo",
)
Expand Down Expand Up @@ -315,7 +315,7 @@ def test_db_script_errors(fileutils, wlmutils):
colo_model.colocate_db(
port=wlmutils.get_test_port(),
db_cpus=1,
limit_app_cpus=False,
limit_db_cpus=False,
debug=True,
ifname="lo",
)
Expand All @@ -335,7 +335,7 @@ def test_db_script_errors(fileutils, wlmutils):
entity.colocate_db(
port=wlmutils.get_test_port() + i,
db_cpus=1,
limit_app_cpus=False,
limit_db_cpus=False,
debug=True,
ifname="lo",
)
Expand All @@ -358,7 +358,7 @@ def test_db_script_errors(fileutils, wlmutils):
entity.colocate_db(
port=wlmutils.get_test_port() + i,
db_cpus=1,
limit_app_cpus=False,
limit_db_cpus=False,
debug=True,
ifname="lo",
)
Expand Down
4 changes: 4 additions & 0 deletions tests/conftest/setup_test_colo/colocated_model.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
SmartRedis Library@19-19-36:WARNING: Environment variable SR_LOG_FILE is not set. Defaulting to stdout
SmartRedis Library@19-19-36:WARNING: Environment variable SR_LOG_LEVEL is not set. Defaulting to INFO
default@19-19-36:ERROR: Unable to connect to backend database: Failed to connect to Redis: Connection refused
Test worked! Sent and received array: [1 2 3 4]
Loading

0 comments on commit ac466c9

Please sign in to comment.