Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optionally skip building Torch with Intel MKL #538

Merged
merged 10 commits into from
Apr 5, 2024
7 changes: 7 additions & 0 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ To be released at some future point in time

Description

- Add option to build Torch backend without the Intel Math Kernel Library
- Fix ReadTheDocs build issue
- Promote device options to an Enum
- Update telemetry monitor, add telemetry collectors
Expand All @@ -35,6 +36,11 @@ Description

Detailed Notes

- Add an option to smart build, "--no_torch_with_mkl", to
prevent Torch from trying to link in the Intel Math Kernel Library. This
is needed because on machines that have the Intel compilers installed,
Torch will unconditionally try to link in this library; however, the link
fails because the linking flags are incorrect. (SmartSim-PR538_)
- Change type_extension and pydantic versions in readthedocs environment
to enable docs build. (SmartSim-PR537_)
- Promote devices to a dedicated Enum type throughout the SmartSim code base.
Expand Down Expand Up @@ -77,6 +83,7 @@ Detailed Notes
- Remove previously deprecated behavior present in test suite on machines with
Slurm and Open MPI. (SmartSim-PR520_)

.. _SmartSim-PR538: https://github.com/CrayLabs/SmartSim/pull/538
.. _SmartSim-PR537: https://github.com/CrayLabs/SmartSim/pull/537
.. _SmartSim-PR498: https://github.com/CrayLabs/SmartSim/pull/498
.. _SmartSim-PR460: https://github.com/CrayLabs/SmartSim/pull/460
Expand Down
10 changes: 10 additions & 0 deletions smartsim/_core/_cli/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ def build_redis_ai(
torch_dir: t.Union[str, Path, None] = None,
libtf_dir: t.Union[str, Path, None] = None,
verbose: bool = False,
torch_with_mkl: bool = True,
) -> None:
# make sure user isn't trying to do something silly on MacOS
if build_env.PLATFORM == "darwin" and device == Device.GPU:
Expand Down Expand Up @@ -186,6 +187,7 @@ def build_redis_ai(
build_tf=use_tf,
build_onnx=use_onnx,
verbose=verbose,
torch_with_mkl=torch_with_mkl,
)

if rai_builder.is_built:
Expand Down Expand Up @@ -414,6 +416,7 @@ def execute(
args.torch_dir,
args.libtensorflow_dir,
verbose=verbose,
torch_with_mkl=args.torch_with_mkl,
)
except (SetupError, BuildError) as e:
logger.error(str(e))
Expand Down Expand Up @@ -496,3 +499,10 @@ def configure_parser(parser: argparse.ArgumentParser) -> None:
default=False,
help="Build KeyDB instead of Redis",
)

MattToast marked this conversation as resolved.
Show resolved Hide resolved
parser.add_argument(
"--no_torch_with_mkl",
dest="torch_with_mkl",
action="store_false",
help="Do not build Torch with Intel MKL",
)
66 changes: 44 additions & 22 deletions smartsim/_core/_install/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

import concurrent.futures
import enum
import fileinput
import itertools
import os
import platform
Expand All @@ -53,8 +54,7 @@
# TODO: check cmake version and use system if possible to avoid conflicts

TRedisAIBackendStr = t.Literal["tensorflow", "torch", "onnxruntime", "tflite"]


_PathLike = t.Union[str, "os.PathLike[str]"]
_T = t.TypeVar("_T")
_U = t.TypeVar("_U")

Expand Down Expand Up @@ -369,15 +369,15 @@ class _RAIBuildDependency(ABC):
def __rai_dependency_name__(self) -> str: ...

@abstractmethod
def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: ...
def __place_for_rai__(self, target: _PathLike) -> Path: ...

@staticmethod
@abstractmethod
def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: ...


def _place_rai_dep_at(
target: t.Union[str, "os.PathLike[str]"], verbose: bool
target: _PathLike, verbose: bool
) -> t.Callable[[_RAIBuildDependency], Path]:
def _place(dep: _RAIBuildDependency) -> Path:
if verbose:
Expand Down Expand Up @@ -410,6 +410,7 @@ def __init__(
build_onnx: bool = False,
jobs: int = 1,
verbose: bool = False,
torch_with_mkl: bool = True,
) -> None:
super().__init__(
build_env or {},
Expand All @@ -428,6 +429,9 @@ def __init__(
self.libtf_dir = libtf_dir
self.torch_dir = torch_dir

# extra configuration options
self.torch_with_mkl = torch_with_mkl

# Sanity checks
self._validate_platform()

Expand Down Expand Up @@ -517,8 +521,8 @@ def _get_deps_to_fetch_for(
# DLPack is always required
fetchable_deps: t.List[_RAIBuildDependency] = [_DLPackRepository("v0.5_RAI")]
if self.fetch_torch:
pt_dep = _choose_pt_variant(os_)
fetchable_deps.append(pt_dep(arch, device, "2.0.1"))
pt_dep = _choose_pt_variant(os_)(arch, device, "2.0.1", self.torch_with_mkl)
fetchable_deps.append(pt_dep)
if self.fetch_tf:
fetchable_deps.append(_TFArchive(os_, arch, device, "2.13.1"))
if self.fetch_onnx:
Expand Down Expand Up @@ -755,7 +759,7 @@ def url(self) -> str: ...
class _WebGitRepository(_WebLocation):
def clone(
self,
target: t.Union[str, "os.PathLike[str]"],
target: _PathLike,
depth: t.Optional[int] = None,
branch: t.Optional[str] = None,
) -> None:
Expand Down Expand Up @@ -785,7 +789,7 @@ def url(self) -> str:
def __rai_dependency_name__(self) -> str:
return f"dlpack@{self.url}"

def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:
def __place_for_rai__(self, target: _PathLike) -> Path:
target = Path(target) / "dlpack"
self.clone(target, branch=self.version, depth=1)
if not target.is_dir():
Expand All @@ -799,7 +803,7 @@ def name(self) -> str:
_, name = self.url.rsplit("/", 1)
return name

def download(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:
def download(self, target: _PathLike) -> Path:
target = Path(target)
if target.is_dir():
target = target / self.name
Expand All @@ -809,28 +813,22 @@ def download(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:

class _ExtractableWebArchive(_WebArchive, ABC):
@abstractmethod
def _extract_download(
self, download_path: Path, target: t.Union[str, "os.PathLike[str]"]
) -> None: ...
def _extract_download(self, download_path: Path, target: _PathLike) -> None: ...

def extract(self, target: t.Union[str, "os.PathLike[str]"]) -> None:
def extract(self, target: _PathLike) -> None:
with tempfile.TemporaryDirectory() as tmp_dir:
arch_path = self.download(tmp_dir)
self._extract_download(arch_path, target)


class _WebTGZ(_ExtractableWebArchive):
def _extract_download(
self, download_path: Path, target: t.Union[str, "os.PathLike[str]"]
) -> None:
def _extract_download(self, download_path: Path, target: _PathLike) -> None:
with tarfile.open(download_path, "r") as tgz_file:
tgz_file.extractall(target)


class _WebZip(_ExtractableWebArchive):
def _extract_download(
self, download_path: Path, target: t.Union[str, "os.PathLike[str]"]
) -> None:
def _extract_download(self, download_path: Path, target: _PathLike) -> None:
with zipfile.ZipFile(download_path, "r") as zip_file:
zip_file.extractall(target)

Expand All @@ -840,6 +838,7 @@ class _PTArchive(_WebZip, _RAIBuildDependency):
architecture: Architecture
device: Device
version: str
with_mkl: bool

@staticmethod
def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]:
Expand All @@ -854,7 +853,20 @@ def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]:
def __rai_dependency_name__(self) -> str:
return f"libtorch@{self.url}"

def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:
@staticmethod
def _patch_out_mkl(libtorch_root: Path) -> None:
    """Comment out the ``find_package(MKL QUIET)`` call in the MKL CMake file.

    :param libtorch_root: Directory that contains
        ``share/cmake/Caffe2/public/mkl.cmake``
    """
    mkl_cmake_file = libtorch_root / "share/cmake/Caffe2/public/mkl.cmake"
    # The pattern escapes the parentheses so that it matches the literal call;
    # the replacement is the same call prefixed with a CMake comment marker.
    _modify_source_files(
        files=mkl_cmake_file,
        regex=r"find_package\(MKL QUIET\)",
        replacement="# find_package(MKL QUIET)",
    )

def extract(self, target: _PathLike) -> None:
    """Extract the archive into ``target`` and, if requested, patch out MKL.

    :param target: Directory the archive is extracted into
    """
    super().extract(target)
    if self.with_mkl:
        # Nothing to patch: leave the extracted CMake files untouched
        return
    # NOTE(review): ``target`` is passed on as the libtorch root, while
    # ``__place_for_rai__`` looks for a ``libtorch`` directory beneath the
    # same ``target`` -- confirm the patched path matches the archive layout.
    self._patch_out_mkl(Path(target))

def __place_for_rai__(self, target: _PathLike) -> Path:
self.extract(target)
target = Path(target) / "libtorch"
if not target.is_dir():
Expand Down Expand Up @@ -964,7 +976,7 @@ def url(self) -> str:
def __rai_dependency_name__(self) -> str:
return f"libtensorflow@{self.url}"

def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:
def __place_for_rai__(self, target: _PathLike) -> Path:
target = Path(target) / "libtensorflow"
target.mkdir()
self.extract(target)
Expand Down Expand Up @@ -1010,7 +1022,7 @@ def url(self) -> str:
def __rai_dependency_name__(self) -> str:
return f"onnxruntime@{self.url}"

def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:
def __place_for_rai__(self, target: _PathLike) -> Path:
target = Path(target).resolve() / "onnxruntime"
self.extract(target)
try:
Expand Down Expand Up @@ -1051,3 +1063,13 @@ def config_git_command(plat: Platform, cmd: t.Sequence[str]) -> t.List[str]:
+ cmd[where:]
)
return cmd


def _modify_source_files(
    files: t.Union[_PathLike, t.Iterable[_PathLike]], regex: str, replacement: str
) -> None:
    """Apply a regex substitution, in place, to every line of the given files.

    :param files: A single path, or an iterable of paths, to rewrite in place
    :param regex: Pattern searched for on each line
    :param replacement: Text substituted for every match of ``regex``
    """
    if not isinstance(files, (str, os.PathLike)):
        files = tuple(files)
        if not files:
            # ``fileinput`` falls back to reading stdin when it is handed an
            # empty sequence; with nothing to modify this must be a no-op.
            return

    compiled_regex = re.compile(regex)
    # With ``inplace=True``, ``fileinput`` redirects stdout into the file that
    # is currently being read, so each ``print`` writes the line back to it.
    with fileinput.input(files=files, inplace=True) as handles:
        for line in handles:
            print(compiled_regex.sub(replacement, line), end="")
42 changes: 37 additions & 5 deletions tests/install/test_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,7 @@

import functools
import pathlib
import platform
import threading
import textwrap
import time

import pytest
Expand Down Expand Up @@ -254,13 +253,13 @@ def test_PTArchiveMacOSX_url():
pt_version = RAI_VERSIONS.torch

pt_linux_cpu = build._PTArchiveLinux(
build.Architecture.X64, build.Device.CPU, pt_version
build.Architecture.X64, build.Device.CPU, pt_version, False
)
x64_prefix = "https://download.pytorch.org/libtorch/"
assert x64_prefix in pt_linux_cpu.url

pt_macosx_cpu = build._PTArchiveMacOSX(
build.Architecture.ARM64, build.Device.CPU, pt_version
build.Architecture.ARM64, build.Device.CPU, pt_version, False
)
arm64_prefix = "https://github.com/CrayLabs/ml_lib_builder/releases/download/"
assert arm64_prefix in pt_macosx_cpu.url
Expand All @@ -269,7 +268,7 @@ def test_PTArchiveMacOSX_url():
def test_PTArchiveMacOSX_gpu_error():
with pytest.raises(build.BuildError, match="support GPU on Mac OSX"):
build._PTArchiveMacOSX(
build.Architecture.ARM64, build.Device.GPU, RAI_VERSIONS.torch
build.Architecture.ARM64, build.Device.GPU, RAI_VERSIONS.torch, False
).url


Expand Down Expand Up @@ -370,3 +369,36 @@ def test_valid_platforms():
)
def test_git_commands_are_configered_correctly_for_platforms(plat, cmd, expected_cmd):
assert build.config_git_command(plat, cmd) == expected_cmd


def test_modify_source_files(p_test_dir):
    """Check that ``_modify_source_files`` rewrites only the files it is given."""
    original_word, mutated_word = "SPAM", "EGGS"

    def make_text_blurb(food):
        return textwrap.dedent(f"""\
            My favorite food is {food}
            {food} is an important part of a healthy breakfast
            {food} {food} {food} {food}
            This line should be unchanged!
            --> {food} <--
            """)

    untouched = make_text_blurb(original_word)
    rewritten = make_text_blurb(mutated_word)

    # Three files that all start out with identical contents
    source_files = [p_test_dir / f"test_{i}" for i in range(3)]
    for path in source_files:
        path.touch()
        path.write_text(untouched)
    first, second, third = source_files

    # Modify a single file
    build._modify_source_files(first, original_word, mutated_word)
    assert first.read_text() == rewritten
    assert second.read_text() == untouched
    assert third.read_text() == untouched

    # Modify multiple files
    build._modify_source_files((second, third), original_word, mutated_word)
    assert second.read_text() == rewritten
    assert third.read_text() == rewritten
Loading