Skip to content

Commit

Permalink
Merge branch 'develop' into 478
Browse files Browse the repository at this point in the history
  • Loading branch information
ankona authored Apr 9, 2024
2 parents c61f478 + 505de50 commit 99f9b65
Show file tree
Hide file tree
Showing 10 changed files with 383 additions and 59 deletions.
21 changes: 21 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import os
import pathlib
import shutil
import signal
import sys
import tempfile
import typing as t
Expand Down Expand Up @@ -206,6 +207,26 @@ def alloc_specs() -> t.Dict[str, t.Any]:
return specs


def _reset_signal(signalnum: int):
"""SmartSim will set/overwrite signals on occasion. This function will
return a generator that can be used as a fixture to automatically reset the
signal handler to what it was at the beginning of the test suite to keep
tests atomic.
"""
original = signal.getsignal(signalnum)

def _reset():
yield
signal.signal(signalnum, original)

return _reset


_reset_signal_interrupt = pytest.fixture(
_reset_signal(signal.SIGINT), autouse=True, scope="function"
)


@pytest.fixture
def wlmutils() -> t.Type[WLMUtils]:
return WLMUtils
Expand Down
14 changes: 14 additions & 0 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ To be released at some future point in time
Description

- Update watchdog dependency
- Add option to build Torch backend without the Intel Math Kernel Library
- Fix ReadTheDocs build issue
- Promote device options to an Enum
- Update telemetry monitor, add telemetry collectors
Expand All @@ -33,10 +34,16 @@ Description
- Fix publishing of development docs
- Update Experiment API typing
- Minor enhancements to test suite
- Improve SmartSim experiment signal handlers

Detailed Notes

- Update watchdog dependency from 3.x to 4.x, fix new type issues (SmartSim-PR540_)
- Add an option to smart build "--torch_with_mkl"/"--no_torch_with_mkl" to
prevent Torch from trying to link in the Intel Math Kernel Library. This
is needed because on machines that have the Intel compilers installed, the
Torch will unconditionally try to link in this library, however fails
because the linking flags are incorrect. (SmartSim-PR538_)
- Change type_extension and pydantic versions in readthedocs environment
to enable docs build. (SmartSim-PR537_)
- Promote devices to a dedicated Enum type throughout the SmartSim code base.
Expand Down Expand Up @@ -78,12 +85,19 @@ Detailed Notes
undefined. (SmartSim-PR521_)
- Remove previously deprecated behavior present in test suite on machines with
Slurm and Open MPI. (SmartSim-PR520_)
- When calling ``Experiment.start`` SmartSim would register a signal handler
that would capture an interrupt signal (^C) to kill any jobs launched through
its ``JobManager``. This would replace the default (or user defined) signal
handler. SmartSim will now attempt to kill any launched jobs before calling
the previously registered signal handler. (SmartSim-PR535_)

.. _SmartSim-PR540: https://github.com/CrayLabs/SmartSim/pull/540
.. _SmartSim-PR538: https://github.com/CrayLabs/SmartSim/pull/538
.. _SmartSim-PR537: https://github.com/CrayLabs/SmartSim/pull/537
.. _SmartSim-PR498: https://github.com/CrayLabs/SmartSim/pull/498
.. _SmartSim-PR460: https://github.com/CrayLabs/SmartSim/pull/460
.. _SmartSim-PR512: https://github.com/CrayLabs/SmartSim/pull/512
.. _SmartSim-PR535: https://github.com/CrayLabs/SmartSim/pull/535
.. _SmartSim-PR529: https://github.com/CrayLabs/SmartSim/pull/529
.. _SmartSim-PR522: https://github.com/CrayLabs/SmartSim/pull/522
.. _SmartSim-PR521: https://github.com/CrayLabs/SmartSim/pull/521
Expand Down
10 changes: 10 additions & 0 deletions smartsim/_core/_cli/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ def build_redis_ai(
torch_dir: t.Union[str, Path, None] = None,
libtf_dir: t.Union[str, Path, None] = None,
verbose: bool = False,
torch_with_mkl: bool = True,
) -> None:
# make sure user isn't trying to do something silly on MacOS
if build_env.PLATFORM == "darwin" and device == Device.GPU:
Expand Down Expand Up @@ -186,6 +187,7 @@ def build_redis_ai(
build_tf=use_tf,
build_onnx=use_onnx,
verbose=verbose,
torch_with_mkl=torch_with_mkl,
)

if rai_builder.is_built:
Expand Down Expand Up @@ -414,6 +416,7 @@ def execute(
args.torch_dir,
args.libtensorflow_dir,
verbose=verbose,
torch_with_mkl=args.torch_with_mkl,
)
except (SetupError, BuildError) as e:
logger.error(str(e))
Expand Down Expand Up @@ -496,3 +499,10 @@ def configure_parser(parser: argparse.ArgumentParser) -> None:
default=False,
help="Build KeyDB instead of Redis",
)

parser.add_argument(
"--no_torch_with_mkl",
dest="torch_with_mkl",
action="store_false",
help="Do not build Torch with Intel MKL",
)
66 changes: 44 additions & 22 deletions smartsim/_core/_install/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

import concurrent.futures
import enum
import fileinput
import itertools
import os
import platform
Expand All @@ -53,8 +54,7 @@
# TODO: check cmake version and use system if possible to avoid conflicts

TRedisAIBackendStr = t.Literal["tensorflow", "torch", "onnxruntime", "tflite"]


_PathLike = t.Union[str, "os.PathLike[str]"]
_T = t.TypeVar("_T")
_U = t.TypeVar("_U")

Expand Down Expand Up @@ -369,15 +369,15 @@ class _RAIBuildDependency(ABC):
def __rai_dependency_name__(self) -> str: ...

@abstractmethod
def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: ...
def __place_for_rai__(self, target: _PathLike) -> Path: ...

@staticmethod
@abstractmethod
def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: ...


def _place_rai_dep_at(
target: t.Union[str, "os.PathLike[str]"], verbose: bool
target: _PathLike, verbose: bool
) -> t.Callable[[_RAIBuildDependency], Path]:
def _place(dep: _RAIBuildDependency) -> Path:
if verbose:
Expand Down Expand Up @@ -410,6 +410,7 @@ def __init__(
build_onnx: bool = False,
jobs: int = 1,
verbose: bool = False,
torch_with_mkl: bool = True,
) -> None:
super().__init__(
build_env or {},
Expand All @@ -428,6 +429,9 @@ def __init__(
self.libtf_dir = libtf_dir
self.torch_dir = torch_dir

# extra configuration options
self.torch_with_mkl = torch_with_mkl

# Sanity checks
self._validate_platform()

Expand Down Expand Up @@ -517,8 +521,8 @@ def _get_deps_to_fetch_for(
# DLPack is always required
fetchable_deps: t.List[_RAIBuildDependency] = [_DLPackRepository("v0.5_RAI")]
if self.fetch_torch:
pt_dep = _choose_pt_variant(os_)
fetchable_deps.append(pt_dep(arch, device, "2.0.1"))
pt_dep = _choose_pt_variant(os_)(arch, device, "2.0.1", self.torch_with_mkl)
fetchable_deps.append(pt_dep)
if self.fetch_tf:
fetchable_deps.append(_TFArchive(os_, arch, device, "2.13.1"))
if self.fetch_onnx:
Expand Down Expand Up @@ -755,7 +759,7 @@ def url(self) -> str: ...
class _WebGitRepository(_WebLocation):
def clone(
self,
target: t.Union[str, "os.PathLike[str]"],
target: _PathLike,
depth: t.Optional[int] = None,
branch: t.Optional[str] = None,
) -> None:
Expand Down Expand Up @@ -785,7 +789,7 @@ def url(self) -> str:
def __rai_dependency_name__(self) -> str:
return f"dlpack@{self.url}"

def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:
def __place_for_rai__(self, target: _PathLike) -> Path:
target = Path(target) / "dlpack"
self.clone(target, branch=self.version, depth=1)
if not target.is_dir():
Expand All @@ -799,7 +803,7 @@ def name(self) -> str:
_, name = self.url.rsplit("/", 1)
return name

def download(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:
def download(self, target: _PathLike) -> Path:
target = Path(target)
if target.is_dir():
target = target / self.name
Expand All @@ -809,28 +813,22 @@ def download(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:

class _ExtractableWebArchive(_WebArchive, ABC):
@abstractmethod
def _extract_download(
self, download_path: Path, target: t.Union[str, "os.PathLike[str]"]
) -> None: ...
def _extract_download(self, download_path: Path, target: _PathLike) -> None: ...

def extract(self, target: t.Union[str, "os.PathLike[str]"]) -> None:
def extract(self, target: _PathLike) -> None:
with tempfile.TemporaryDirectory() as tmp_dir:
arch_path = self.download(tmp_dir)
self._extract_download(arch_path, target)


class _WebTGZ(_ExtractableWebArchive):
def _extract_download(
self, download_path: Path, target: t.Union[str, "os.PathLike[str]"]
) -> None:
def _extract_download(self, download_path: Path, target: _PathLike) -> None:
with tarfile.open(download_path, "r") as tgz_file:
tgz_file.extractall(target)


class _WebZip(_ExtractableWebArchive):
def _extract_download(
self, download_path: Path, target: t.Union[str, "os.PathLike[str]"]
) -> None:
def _extract_download(self, download_path: Path, target: _PathLike) -> None:
with zipfile.ZipFile(download_path, "r") as zip_file:
zip_file.extractall(target)

Expand All @@ -840,6 +838,7 @@ class _PTArchive(_WebZip, _RAIBuildDependency):
architecture: Architecture
device: Device
version: str
with_mkl: bool

@staticmethod
def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]:
Expand All @@ -854,7 +853,20 @@ def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]:
def __rai_dependency_name__(self) -> str:
return f"libtorch@{self.url}"

def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:
@staticmethod
def _patch_out_mkl(libtorch_root: Path) -> None:
_modify_source_files(
libtorch_root / "share/cmake/Caffe2/public/mkl.cmake",
r"find_package\(MKL QUIET\)",
"# find_package(MKL QUIET)",
)

def extract(self, target: _PathLike) -> None:
super().extract(target)
if not self.with_mkl:
self._patch_out_mkl(Path(target))

def __place_for_rai__(self, target: _PathLike) -> Path:
self.extract(target)
target = Path(target) / "libtorch"
if not target.is_dir():
Expand Down Expand Up @@ -964,7 +976,7 @@ def url(self) -> str:
def __rai_dependency_name__(self) -> str:
return f"libtensorflow@{self.url}"

def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:
def __place_for_rai__(self, target: _PathLike) -> Path:
target = Path(target) / "libtensorflow"
target.mkdir()
self.extract(target)
Expand Down Expand Up @@ -1010,7 +1022,7 @@ def url(self) -> str:
def __rai_dependency_name__(self) -> str:
return f"onnxruntime@{self.url}"

def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:
def __place_for_rai__(self, target: _PathLike) -> Path:
target = Path(target).resolve() / "onnxruntime"
self.extract(target)
try:
Expand Down Expand Up @@ -1051,3 +1063,13 @@ def config_git_command(plat: Platform, cmd: t.Sequence[str]) -> t.List[str]:
+ cmd[where:]
)
return cmd


def _modify_source_files(
files: t.Union[_PathLike, t.Iterable[_PathLike]], regex: str, replacement: str
) -> None:
compiled_regex = re.compile(regex)
with fileinput.input(files=files, inplace=True) as handles:
for line in handles:
line = compiled_regex.sub(replacement, line)
print(line, end="")
15 changes: 12 additions & 3 deletions smartsim/_core/control/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,11 @@
from smartsim._core.utils.network import get_ip_from_host

from ..._core.launcher.step import Step
from ..._core.utils.helpers import unpack_colo_db_identifier, unpack_db_identifier
from ..._core.utils.helpers import (
SignalInterceptionStack,
unpack_colo_db_identifier,
unpack_db_identifier,
)
from ..._core.utils.redis import (
db_is_active,
set_ml_model,
Expand Down Expand Up @@ -71,6 +75,8 @@
from .manifest import LaunchedManifest, LaunchedManifestBuilder, Manifest

if t.TYPE_CHECKING:
from types import FrameType

from ..utils.serialize import TStepLaunchMetaData


Expand Down Expand Up @@ -113,8 +119,11 @@ def start(
execution of all jobs.
"""
self._jobs.kill_on_interrupt = kill_on_interrupt

# register custom signal handler for ^C (SIGINT)
signal.signal(signal.SIGINT, self._jobs.signal_interrupt)
SignalInterceptionStack.get(signal.SIGINT).push_unique(
self._jobs.signal_interrupt
)
launched = self._launch(exp_name, exp_path, manifest)

# start the job manager thread if not already started
Expand All @@ -132,7 +141,7 @@ def start(
# block until all non-database jobs are complete
if block:
# poll handles its own keyboard interrupt as
# it may be called seperately
# it may be called separately
self.poll(5, True, kill_on_interrupt=kill_on_interrupt)

@property
Expand Down
2 changes: 1 addition & 1 deletion smartsim/_core/control/jobmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,9 +350,9 @@ def set_db_hosts(self, orchestrator: Orchestrator) -> None:
self.db_jobs[dbnode.name].hosts = dbnode.hosts

def signal_interrupt(self, signo: int, _frame: t.Optional[FrameType]) -> None:
"""Custom handler for whenever SIGINT is received"""
if not signo:
logger.warning("Received SIGINT with no signal number")
"""Custom handler for whenever SIGINT is received"""
if self.actively_monitoring and len(self) > 0:
if self.kill_on_interrupt:
for _, job in self().items():
Expand Down
Loading

0 comments on commit 99f9b65

Please sign in to comment.