From 619d64b8609e1cd136435f172efae534f38c4f88 Mon Sep 17 00:00:00 2001
From: Andrew Shao
Date: Fri, 5 Apr 2024 10:34:52 -0700
Subject: [PATCH] Optionally skip building Torch with Intel MKL (#538)

On systems that have the Intel compilers and/or the Intel Math Kernel
Library (MKL) installed, the Caffe2 package that comes with Torch will
unconditionally try to link in MKL when the Torch backend is built.
This, however, can lead to two types of failures:

- Compiling the Torch backend fails because the MKL library path is not
  on the linker's search path
- Loading the Torch backend into RedisAI fails because the user does not
  expect to need the MKL library at runtime

To alleviate this, a new option "--no_torch_with_mkl" has been added to
the `smart build` command that modifies the mkl.cmake file to prevent
the detection of MKL.

[ committed by @ashao ]
[ reviewed by @MattToast and @al-rigazzi ]
---
 doc/changelog.rst                  |  7 ++++
 smartsim/_core/_cli/build.py       | 10 +++++
 smartsim/_core/_install/builder.py | 66 ++++++++++++++++++++----------
 tests/install/test_builder.py      | 42 ++++++++++++++++---
 4 files changed, 98 insertions(+), 27 deletions(-)

diff --git a/doc/changelog.rst b/doc/changelog.rst
index 3e73101e1..5e7b89f0f 100644
--- a/doc/changelog.rst
+++ b/doc/changelog.rst
@@ -18,6 +18,7 @@ To be released at some future point in time

 Description

+- Add option to build Torch backend without the Intel Math Kernel Library
 - Fix ReadTheDocs build issue
 - Promote device options to an Enum
 - Update telemetry monitor, add telemetry collectors
@@ -35,6 +36,11 @@ Description

 Detailed Notes

+- Add an option to smart build ("--torch_with_mkl"/"--no_torch_with_mkl") to
+  prevent Torch from trying to link in the Intel Math Kernel Library. This
+  is needed because, on machines that have the Intel compilers installed,
+  Torch will unconditionally try to link in this library but fails because
+  the linking flags are incorrect. (SmartSim-PR538_)
 - Change type_extension and pydantic versions in readthedocs environment
   to enable docs build. (SmartSim-PR537_)
 - Promote devices to a dedicated Enum type throughout the SmartSim code base.
@@ -77,6 +83,7 @@ Detailed Notes
 - Remove previously deprecated behavior present in test suite on machines with
   Slurm and Open MPI. (SmartSim-PR520_)

+.. _SmartSim-PR538: https://github.com/CrayLabs/SmartSim/pull/538
 .. _SmartSim-PR537: https://github.com/CrayLabs/SmartSim/pull/537
 .. _SmartSim-PR498: https://github.com/CrayLabs/SmartSim/pull/498
 .. _SmartSim-PR460: https://github.com/CrayLabs/SmartSim/pull/460
diff --git a/smartsim/_core/_cli/build.py b/smartsim/_core/_cli/build.py
index 08a1a6138..ab982ac1b 100644
--- a/smartsim/_core/_cli/build.py
+++ b/smartsim/_core/_cli/build.py
@@ -139,6 +139,7 @@ def build_redis_ai(
     torch_dir: t.Union[str, Path, None] = None,
     libtf_dir: t.Union[str, Path, None] = None,
     verbose: bool = False,
+    torch_with_mkl: bool = True,
 ) -> None:
     # make sure user isn't trying to do something silly on MacOS
     if build_env.PLATFORM == "darwin" and device == Device.GPU:
@@ -186,6 +187,7 @@ def build_redis_ai(
         build_tf=use_tf,
         build_onnx=use_onnx,
         verbose=verbose,
+        torch_with_mkl=torch_with_mkl,
     )

     if rai_builder.is_built:
@@ -414,6 +416,7 @@ def execute(
             args.torch_dir,
             args.libtensorflow_dir,
             verbose=verbose,
+            torch_with_mkl=args.torch_with_mkl,
         )
     except (SetupError, BuildError) as e:
         logger.error(str(e))
@@ -496,3 +499,10 @@ def configure_parser(parser: argparse.ArgumentParser) -> None:
         default=False,
         help="Build KeyDB instead of Redis",
     )
+
+    parser.add_argument(
+        "--no_torch_with_mkl",
+        dest="torch_with_mkl",
+        action="store_false",
+        help="Do not build Torch with Intel MKL",
+    )
diff --git a/smartsim/_core/_install/builder.py b/smartsim/_core/_install/builder.py
index 47f12d044..d0dbc5a6a 100644
--- a/smartsim/_core/_install/builder.py
+++ b/smartsim/_core/_install/builder.py
@@ -28,6 +28,7 @@

 import concurrent.futures
 import enum
+import fileinput
 import itertools
 import os
 import platform
@@ -53,8 +54,7 @@

 # TODO: check cmake version and use system if possible to avoid conflicts

 TRedisAIBackendStr = t.Literal["tensorflow", "torch", "onnxruntime", "tflite"]
-
-
+_PathLike = t.Union[str, "os.PathLike[str]"]
 _T = t.TypeVar("_T")
 _U = t.TypeVar("_U")
@@ -369,7 +369,7 @@ class _RAIBuildDependency(ABC):
     def __rai_dependency_name__(self) -> str: ...

     @abstractmethod
-    def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: ...
+    def __place_for_rai__(self, target: _PathLike) -> Path: ...

     @staticmethod
     @abstractmethod
@@ -377,7 +377,7 @@ def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]:


 def _place_rai_dep_at(
-    target: t.Union[str, "os.PathLike[str]"], verbose: bool
+    target: _PathLike, verbose: bool
 ) -> t.Callable[[_RAIBuildDependency], Path]:
     def _place(dep: _RAIBuildDependency) -> Path:
         if verbose:
@@ -410,6 +410,7 @@ def __init__(
         build_onnx: bool = False,
         jobs: int = 1,
         verbose: bool = False,
+        torch_with_mkl: bool = True,
     ) -> None:
         super().__init__(
             build_env or {},
@@ -428,6 +429,9 @@ def __init__(
         self.libtf_dir = libtf_dir
         self.torch_dir = torch_dir

+        # extra configuration options
+        self.torch_with_mkl = torch_with_mkl
+
         # Sanity checks
         self._validate_platform()

@@ -517,8 +521,8 @@ def _get_deps_to_fetch_for(
         # DLPack is always required
         fetchable_deps: t.List[_RAIBuildDependency] = [_DLPackRepository("v0.5_RAI")]
         if self.fetch_torch:
-            pt_dep = _choose_pt_variant(os_)
-            fetchable_deps.append(pt_dep(arch, device, "2.0.1"))
+            pt_dep = _choose_pt_variant(os_)(arch, device, "2.0.1", self.torch_with_mkl)
+            fetchable_deps.append(pt_dep)
         if self.fetch_tf:
             fetchable_deps.append(_TFArchive(os_, arch, device, "2.13.1"))
         if self.fetch_onnx:
@@ -755,7 +759,7 @@ def url(self) -> str: ...
 class _WebGitRepository(_WebLocation):
     def clone(
         self,
-        target: t.Union[str, "os.PathLike[str]"],
+        target: _PathLike,
         depth: t.Optional[int] = None,
         branch: t.Optional[str] = None,
     ) -> None:
@@ -785,7 +789,7 @@ def url(self) -> str:
     def __rai_dependency_name__(self) -> str:
         return f"dlpack@{self.url}"

-    def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:
+    def __place_for_rai__(self, target: _PathLike) -> Path:
         target = Path(target) / "dlpack"
         self.clone(target, branch=self.version, depth=1)
         if not target.is_dir():
@@ -799,7 +803,7 @@ def name(self) -> str:
         _, name = self.url.rsplit("/", 1)
         return name

-    def download(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:
+    def download(self, target: _PathLike) -> Path:
         target = Path(target)
         if target.is_dir():
             target = target / self.name
@@ -809,28 +813,22 @@ def download(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:

 class _ExtractableWebArchive(_WebArchive, ABC):
     @abstractmethod
-    def _extract_download(
-        self, download_path: Path, target: t.Union[str, "os.PathLike[str]"]
-    ) -> None: ...
+    def _extract_download(self, download_path: Path, target: _PathLike) -> None: ...

-    def extract(self, target: t.Union[str, "os.PathLike[str]"]) -> None:
+    def extract(self, target: _PathLike) -> None:
         with tempfile.TemporaryDirectory() as tmp_dir:
             arch_path = self.download(tmp_dir)
             self._extract_download(arch_path, target)


 class _WebTGZ(_ExtractableWebArchive):
-    def _extract_download(
-        self, download_path: Path, target: t.Union[str, "os.PathLike[str]"]
-    ) -> None:
+    def _extract_download(self, download_path: Path, target: _PathLike) -> None:
         with tarfile.open(download_path, "r") as tgz_file:
             tgz_file.extractall(target)


 class _WebZip(_ExtractableWebArchive):
-    def _extract_download(
-        self, download_path: Path, target: t.Union[str, "os.PathLike[str]"]
-    ) -> None:
+    def _extract_download(self, download_path: Path, target: _PathLike) -> None:
         with zipfile.ZipFile(download_path, "r") as zip_file:
             zip_file.extractall(target)

@@ -840,6 +838,7 @@ class _PTArchive(_WebZip, _RAIBuildDependency):
     architecture: Architecture
     device: Device
     version: str
+    with_mkl: bool

     @staticmethod
     def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]:
@@ -854,7 +853,20 @@ def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]:
     def __rai_dependency_name__(self) -> str:
         return f"libtorch@{self.url}"

-    def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:
+    @staticmethod
+    def _patch_out_mkl(libtorch_root: Path) -> None:
+        _modify_source_files(
+            libtorch_root / "share/cmake/Caffe2/public/mkl.cmake",
+            r"find_package\(MKL QUIET\)",
+            "# find_package(MKL QUIET)",
+        )
+
+    def extract(self, target: _PathLike) -> None:
+        super().extract(target)
+        if not self.with_mkl:
+            self._patch_out_mkl(Path(target))
+
+    def __place_for_rai__(self, target: _PathLike) -> Path:
         self.extract(target)
         target = Path(target) / "libtorch"
         if not target.is_dir():
@@ -964,7 +976,7 @@ def url(self) -> str:
     def __rai_dependency_name__(self) -> str:
         return f"libtensorflow@{self.url}"

-    def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:
+    def __place_for_rai__(self, target: _PathLike) -> Path:
         target = Path(target) / "libtensorflow"
         target.mkdir()
         self.extract(target)
@@ -1010,7 +1022,7 @@ def url(self) -> str:
     def __rai_dependency_name__(self) -> str:
         return f"onnxruntime@{self.url}"

-    def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:
+    def __place_for_rai__(self, target: _PathLike) -> Path:
         target = Path(target).resolve() / "onnxruntime"
         self.extract(target)
         try:
@@ -1051,3 +1063,13 @@ def config_git_command(plat: Platform, cmd: t.Sequence[str]) -> t.List[str]:
             + cmd[where:]
         )
     return cmd
+
+
+def _modify_source_files(
+    files: t.Union[_PathLike, t.Iterable[_PathLike]], regex: str, replacement: str
+) -> None:
+    compiled_regex = re.compile(regex)
+    with fileinput.input(files=files, inplace=True) as handles:
+        for line in handles:
+            line = compiled_regex.sub(replacement, line)
+            print(line, end="")
diff --git a/tests/install/test_builder.py b/tests/install/test_builder.py
index c69a083d1..feaf7e54f 100644
--- a/tests/install/test_builder.py
+++ b/tests/install/test_builder.py
@@ -27,8 +27,7 @@

 import functools
 import pathlib
-import platform
-import threading
+import textwrap
 import time

 import pytest
@@ -254,13 +253,13 @@ def test_PTArchiveMacOSX_url():
     pt_version = RAI_VERSIONS.torch

     pt_linux_cpu = build._PTArchiveLinux(
-        build.Architecture.X64, build.Device.CPU, pt_version
+        build.Architecture.X64, build.Device.CPU, pt_version, False
     )
     x64_prefix = "https://download.pytorch.org/libtorch/"
     assert x64_prefix in pt_linux_cpu.url

     pt_macosx_cpu = build._PTArchiveMacOSX(
-        build.Architecture.ARM64, build.Device.CPU, pt_version
+        build.Architecture.ARM64, build.Device.CPU, pt_version, False
     )
     arm64_prefix = "https://github.com/CrayLabs/ml_lib_builder/releases/download/"
     assert arm64_prefix in pt_macosx_cpu.url
@@ -269,7 +268,7 @@ def test_PTArchiveMacOSX_gpu_error():
     with pytest.raises(build.BuildError, match="support GPU on Mac OSX"):
         build._PTArchiveMacOSX(
-            build.Architecture.ARM64, build.Device.GPU, RAI_VERSIONS.torch
+            build.Architecture.ARM64, build.Device.GPU, RAI_VERSIONS.torch, False
         ).url
@@ -370,3 +369,36 @@ def test_valid_platforms():
 )
 def test_git_commands_are_configered_correctly_for_platforms(plat, cmd, expected_cmd):
     assert build.config_git_command(plat, cmd) == expected_cmd
+
+
+def test_modify_source_files(p_test_dir):
+    def make_text_blurb(food):
+        return textwrap.dedent(f"""\
+            My favorite food is {food}
+            {food} is an important part of a healthy breakfast
+            {food} {food} {food} {food}
+            This line should be unchanged!
+            --> {food} <--
+            """)
+
+    original_word = "SPAM"
+    mutated_word = "EGGS"
+
+    source_files = []
+    for i in range(3):
+        source_file = p_test_dir / f"test_{i}"
+        source_file.touch()
+        source_file.write_text(make_text_blurb(original_word))
+        source_files.append(source_file)
+    # Modify a single file
+    build._modify_source_files(source_files[0], original_word, mutated_word)
+    assert source_files[0].read_text() == make_text_blurb(mutated_word)
+    assert source_files[1].read_text() == make_text_blurb(original_word)
+    assert source_files[2].read_text() == make_text_blurb(original_word)
+
+    # Modify multiple files
+    build._modify_source_files(
+        (source_files[1], source_files[2]), original_word, mutated_word
+    )
+    assert source_files[1].read_text() == make_text_blurb(mutated_word)
+    assert source_files[2].read_text() == make_text_blurb(mutated_word)
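
With the patch applied, the Torch backend can be built without MKL by passing
the new flag, e.g. `smart build --device cpu --no_torch_with_mkl` (the
`--device` flag shown here is from the existing CLI). The in-place rewrite that
`_patch_out_mkl` performs reduces to the `fileinput` idiom used by
`_modify_source_files`. Below is a minimal, self-contained sketch of that
technique; the file name and cmake contents are throwaway stand-ins for
share/cmake/Caffe2/public/mkl.cmake, not the real file:

    import fileinput
    import re
    import tempfile
    from pathlib import Path

    def modify_source_files(files, regex: str, replacement: str) -> None:
        """Apply a regex substitution in place, mirroring _modify_source_files."""
        compiled = re.compile(regex)
        # With inplace=True, fileinput redirects stdout into the file being
        # read, so every printed line overwrites the corresponding original.
        with fileinput.input(files=files, inplace=True) as handle:
            for line in handle:
                print(compiled.sub(replacement, line), end="")

    with tempfile.TemporaryDirectory() as tmp:
        # Hypothetical stand-in for mkl.cmake
        cmake_file = Path(tmp) / "mkl.cmake"
        cmake_file.write_text("find_package(MKL QUIET)\nmessage(STATUS done)\n")
        modify_source_files(
            str(cmake_file), r"find_package\(MKL QUIET\)", "# find_package(MKL QUIET)"
        )
        # Only the matching line is rewritten; the rest of the file is untouched
        assert cmake_file.read_text().startswith("# find_package(MKL QUIET)")

Commenting out `find_package(MKL QUIET)` leaves MKL undetected, so the Caffe2
CMake configuration proceeds as if MKL were absent and never emits the broken
link flags described above.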