From 619d64b8609e1cd136435f172efae534f38c4f88 Mon Sep 17 00:00:00 2001
From: Andrew Shao
Date: Fri, 5 Apr 2024 10:34:52 -0700
Subject: [PATCH] Optionally skip building Torch with Intel MKL (#538)

On systems that have the Intel compilers and/or the Intel Math Kernel
Library (MKL) installed, the Caffe2 package that comes with Torch will
unconditionally try to link in MKL when the Torch backend is built.
This, however, can lead to two types of failures:

- Compiling the Torch backend fails because the MKL library path is not
  on the linker's search path
- Loading the Torch backend into RedisAI fails because the user does not
  expect to need the MKL library at runtime

To alleviate this, a new option "--no_torch_with_mkl" has been added to
the `smart build` command that modifies the mkl.cmake file to prevent
the detection of MKL.

[ committed by @ashao ]
[ reviewed by @MattToast and @al-rigazzi ]
---
 doc/changelog.rst                  |  7 ++++
 smartsim/_core/_cli/build.py       | 10 +++++
 smartsim/_core/_install/builder.py | 66 ++++++++++++++++++++----------
 tests/install/test_builder.py      | 42 ++++++++++++++++---
 4 files changed, 98 insertions(+), 27 deletions(-)

diff --git a/doc/changelog.rst b/doc/changelog.rst
index 3e73101e1..5e7b89f0f 100644
--- a/doc/changelog.rst
+++ b/doc/changelog.rst
@@ -18,6 +18,7 @@ To be released at some future point in time

 Description

+- Add option to build Torch backend without the Intel Math Kernel Library
 - Fix ReadTheDocs build issue
 - Promote device options to an Enum
 - Update telemetry monitor, add telemetry collectors
@@ -35,6 +36,11 @@ Description

 Detailed Notes

+- Add an option to smart build ("--torch_with_mkl"/"--no_torch_with_mkl") to
+  prevent Torch from trying to link in the Intel Math Kernel Library. This
+  is needed because, on machines that have the Intel compilers installed,
+  Torch will unconditionally try to link in this library but fails because
+  the linking flags are incorrect. (SmartSim-PR538_)
 - Change type_extension and pydantic versions in readthedocs environment
   to enable docs build. (SmartSim-PR537_)
 - Promote devices to a dedicated Enum type throughout the SmartSim code base.
@@ -77,6 +83,7 @@ Detailed Notes
 - Remove previously deprecated behavior present in test suite on machines with
   Slurm and Open MPI. (SmartSim-PR520_)

+.. _SmartSim-PR538: https://github.com/CrayLabs/SmartSim/pull/538
 .. _SmartSim-PR537: https://github.com/CrayLabs/SmartSim/pull/537
 .. _SmartSim-PR498: https://github.com/CrayLabs/SmartSim/pull/498
 .. _SmartSim-PR460: https://github.com/CrayLabs/SmartSim/pull/460
diff --git a/smartsim/_core/_cli/build.py b/smartsim/_core/_cli/build.py
index 08a1a6138..ab982ac1b 100644
--- a/smartsim/_core/_cli/build.py
+++ b/smartsim/_core/_cli/build.py
@@ -139,6 +139,7 @@ def build_redis_ai(
     torch_dir: t.Union[str, Path, None] = None,
     libtf_dir: t.Union[str, Path, None] = None,
     verbose: bool = False,
+    torch_with_mkl: bool = True,
 ) -> None:
     # make sure user isn't trying to do something silly on MacOS
     if build_env.PLATFORM == "darwin" and device == Device.GPU:
@@ -186,6 +187,7 @@ def build_redis_ai(
         build_tf=use_tf,
         build_onnx=use_onnx,
         verbose=verbose,
+        torch_with_mkl=torch_with_mkl,
     )

     if rai_builder.is_built:
@@ -414,6 +416,7 @@ def execute(
             args.torch_dir,
             args.libtensorflow_dir,
             verbose=verbose,
+            torch_with_mkl=args.torch_with_mkl,
         )
     except (SetupError, BuildError) as e:
         logger.error(str(e))
@@ -496,3 +499,10 @@ def configure_parser(parser: argparse.ArgumentParser) -> None:
         default=False,
         help="Build KeyDB instead of Redis",
     )
+
+    parser.add_argument(
+        "--no_torch_with_mkl",
+        dest="torch_with_mkl",
+        action="store_false",
+        help="Do not build Torch with Intel MKL",
+    )
diff --git a/smartsim/_core/_install/builder.py b/smartsim/_core/_install/builder.py
index 47f12d044..d0dbc5a6a 100644
--- a/smartsim/_core/_install/builder.py
+++ b/smartsim/_core/_install/builder.py
@@ -28,6 +28,7 @@

 import concurrent.futures
 import enum
+import fileinput
 import itertools
 import os
 import platform
@@ -53,8 +54,7 @@

 # TODO: check cmake version and use system if possible to avoid conflicts

 TRedisAIBackendStr = t.Literal["tensorflow", "torch", "onnxruntime", "tflite"]
-
-
+_PathLike = t.Union[str, "os.PathLike[str]"]
 _T = t.TypeVar("_T")
 _U = t.TypeVar("_U")
@@ -369,7 +369,7 @@ class _RAIBuildDependency(ABC):
     def __rai_dependency_name__(self) -> str: ...

     @abstractmethod
-    def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: ...
+    def __place_for_rai__(self, target: _PathLike) -> Path: ...

     @staticmethod
     @abstractmethod
@@ -377,7 +377,7 @@ def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]:


 def _place_rai_dep_at(
-    target: t.Union[str, "os.PathLike[str]"], verbose: bool
+    target: _PathLike, verbose: bool
 ) -> t.Callable[[_RAIBuildDependency], Path]:
     def _place(dep: _RAIBuildDependency) -> Path:
         if verbose:
@@ -410,6 +410,7 @@ def __init__(
         build_onnx: bool = False,
         jobs: int = 1,
         verbose: bool = False,
+        torch_with_mkl: bool = True,
     ) -> None:
         super().__init__(
             build_env or {},
@@ -428,6 +429,9 @@ def __init__(
         self.libtf_dir = libtf_dir
         self.torch_dir = torch_dir

+        # extra configuration options
+        self.torch_with_mkl = torch_with_mkl
+
         # Sanity checks
         self._validate_platform()

@@ -517,8 +521,8 @@ def _get_deps_to_fetch_for(
         # DLPack is always required
         fetchable_deps: t.List[_RAIBuildDependency] = [_DLPackRepository("v0.5_RAI")]
         if self.fetch_torch:
-            pt_dep = _choose_pt_variant(os_)
-            fetchable_deps.append(pt_dep(arch, device, "2.0.1"))
+            pt_dep = _choose_pt_variant(os_)(arch, device, "2.0.1", self.torch_with_mkl)
+            fetchable_deps.append(pt_dep)
         if self.fetch_tf:
             fetchable_deps.append(_TFArchive(os_, arch, device, "2.13.1"))
         if self.fetch_onnx:
@@ -755,7 +759,7 @@ def url(self) -> str: ...
 class _WebGitRepository(_WebLocation):
     def clone(
         self,
-        target: t.Union[str, "os.PathLike[str]"],
+        target: _PathLike,
         depth: t.Optional[int] = None,
         branch: t.Optional[str] = None,
     ) -> None:
@@ -785,7 +789,7 @@ def url(self) -> str:
     def __rai_dependency_name__(self) -> str:
         return f"dlpack@{self.url}"

-    def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:
+    def __place_for_rai__(self, target: _PathLike) -> Path:
         target = Path(target) / "dlpack"
         self.clone(target, branch=self.version, depth=1)
         if not target.is_dir():
@@ -799,7 +803,7 @@ def name(self) -> str:
         _, name = self.url.rsplit("/", 1)
         return name

-    def download(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:
+    def download(self, target: _PathLike) -> Path:
         target = Path(target)
         if target.is_dir():
             target = target / self.name
@@ -809,28 +813,22 @@ def download(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:

 class _ExtractableWebArchive(_WebArchive, ABC):
     @abstractmethod
-    def _extract_download(
-        self, download_path: Path, target: t.Union[str, "os.PathLike[str]"]
-    ) -> None: ...
+    def _extract_download(self, download_path: Path, target: _PathLike) -> None: ...

-    def extract(self, target: t.Union[str, "os.PathLike[str]"]) -> None:
+    def extract(self, target: _PathLike) -> None:
         with tempfile.TemporaryDirectory() as tmp_dir:
             arch_path = self.download(tmp_dir)
             self._extract_download(arch_path, target)


 class _WebTGZ(_ExtractableWebArchive):
-    def _extract_download(
-        self, download_path: Path, target: t.Union[str, "os.PathLike[str]"]
-    ) -> None:
+    def _extract_download(self, download_path: Path, target: _PathLike) -> None:
         with tarfile.open(download_path, "r") as tgz_file:
             tgz_file.extractall(target)


 class _WebZip(_ExtractableWebArchive):
-    def _extract_download(
-        self, download_path: Path, target: t.Union[str, "os.PathLike[str]"]
-    ) -> None:
+    def _extract_download(self, download_path: Path, target: _PathLike) -> None:
         with zipfile.ZipFile(download_path, "r") as zip_file:
             zip_file.extractall(target)

@@ -840,6 +838,7 @@ class _PTArchive(_WebZip, _RAIBuildDependency):
     architecture: Architecture
     device: Device
     version: str
+    with_mkl: bool

     @staticmethod
     def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]:
@@ -854,7 +853,20 @@ def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]:
     def __rai_dependency_name__(self) -> str:
         return f"libtorch@{self.url}"

-    def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:
+    @staticmethod
+    def _patch_out_mkl(libtorch_root: Path) -> None:
+        _modify_source_files(
+            libtorch_root / "share/cmake/Caffe2/public/mkl.cmake",
+            r"find_package\(MKL QUIET\)",
+            "# find_package(MKL QUIET)",
+        )
+
+    def extract(self, target: _PathLike) -> None:
+        super().extract(target)
+        if not self.with_mkl:
+            self._patch_out_mkl(Path(target))
+
+    def __place_for_rai__(self, target: _PathLike) -> Path:
         self.extract(target)
         target = Path(target) / "libtorch"
         if not target.is_dir():
@@ -964,7 +976,7 @@ def url(self) -> str:
     def __rai_dependency_name__(self) -> str:
         return f"libtensorflow@{self.url}"

-    def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:
+    def __place_for_rai__(self, target: _PathLike) -> Path:
         target = Path(target) / "libtensorflow"
         target.mkdir()
         self.extract(target)
@@ -1010,7 +1022,7 @@ def url(self) -> str:
     def __rai_dependency_name__(self) -> str:
         return f"onnxruntime@{self.url}"

-    def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path:
+    def __place_for_rai__(self, target: _PathLike) -> Path:
         target = Path(target).resolve() / "onnxruntime"
         self.extract(target)
         try:
@@ -1051,3 +1063,13 @@ def config_git_command(plat: Platform, cmd: t.Sequence[str]) -> t.List[str]:
             + cmd[where:]
         )
     return cmd
+
+
+def _modify_source_files(
+    files: t.Union[_PathLike, t.Iterable[_PathLike]], regex: str, replacement: str
+) -> None:
+    compiled_regex = re.compile(regex)
+    with fileinput.input(files=files, inplace=True) as handles:
+        for line in handles:
+            line = compiled_regex.sub(replacement, line)
+            print(line, end="")
diff --git a/tests/install/test_builder.py b/tests/install/test_builder.py
index c69a083d1..feaf7e54f 100644
--- a/tests/install/test_builder.py
+++ b/tests/install/test_builder.py
@@ -27,8 +27,7 @@

 import functools
 import pathlib
-import platform
-import threading
+import textwrap
 import time

 import pytest
@@ -254,13 +253,13 @@ def test_PTArchiveMacOSX_url():
     pt_version = RAI_VERSIONS.torch

     pt_linux_cpu = build._PTArchiveLinux(
-        build.Architecture.X64, build.Device.CPU, pt_version
+        build.Architecture.X64, build.Device.CPU, pt_version, False
     )
     x64_prefix = "https://download.pytorch.org/libtorch/"
     assert x64_prefix in pt_linux_cpu.url

     pt_macosx_cpu = build._PTArchiveMacOSX(
-        build.Architecture.ARM64, build.Device.CPU, pt_version
+        build.Architecture.ARM64, build.Device.CPU, pt_version, False
     )
     arm64_prefix = "https://github.com/CrayLabs/ml_lib_builder/releases/download/"
     assert arm64_prefix in pt_macosx_cpu.url
@@ -269,7 +268,7 @@ def test_PTArchiveMacOSX_gpu_error():
     with pytest.raises(build.BuildError, match="support GPU on Mac OSX"):
         build._PTArchiveMacOSX(
-            build.Architecture.ARM64, build.Device.GPU, RAI_VERSIONS.torch
+            build.Architecture.ARM64, build.Device.GPU, RAI_VERSIONS.torch, False
         ).url
@@ -370,3 +369,36 @@ def test_valid_platforms():
 )
 def test_git_commands_are_configered_correctly_for_platforms(plat, cmd, expected_cmd):
     assert build.config_git_command(plat, cmd) == expected_cmd
+
+
+def test_modify_source_files(p_test_dir):
+    def make_text_blurb(food):
+        return textwrap.dedent(f"""\
+            My favorite food is {food}
+            {food} is an important part of a healthy breakfast
+            {food} {food} {food} {food}
+            This line should be unchanged!
+            --> {food} <--
+            """)
+
+    original_word = "SPAM"
+    mutated_word = "EGGS"
+
+    source_files = []
+    for i in range(3):
+        source_file = p_test_dir / f"test_{i}"
+        source_file.touch()
+        source_file.write_text(make_text_blurb(original_word))
+        source_files.append(source_file)
+    # Modify a single file
+    build._modify_source_files(source_files[0], original_word, mutated_word)
+    assert source_files[0].read_text() == make_text_blurb(mutated_word)
+    assert source_files[1].read_text() == make_text_blurb(original_word)
+    assert source_files[2].read_text() == make_text_blurb(original_word)
+
+    # Modify multiple files
+    build._modify_source_files(
+        (source_files[1], source_files[2]), original_word, mutated_word
+    )
+    assert source_files[1].read_text() == make_text_blurb(mutated_word)
+    assert source_files[2].read_text() == make_text_blurb(mutated_word)
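
With the patch applied, the Torch backend can be built without MKL by passing
the new flag, e.g. `smart build --device cpu --no_torch_with_mkl` (the
`--device` flag shown here is from the existing CLI). The in-place rewrite that
`_patch_out_mkl` performs reduces to the `fileinput` idiom used by
`_modify_source_files`. Below is a minimal, self-contained sketch of that
technique; the file name and cmake contents are throwaway stand-ins for
share/cmake/Caffe2/public/mkl.cmake, not the real file:

    import fileinput
    import re
    import tempfile
    from pathlib import Path

    def modify_source_files(files, regex: str, replacement: str) -> None:
        """Apply a regex substitution in place, mirroring _modify_source_files."""
        compiled = re.compile(regex)
        # With inplace=True, fileinput redirects stdout into the file being
        # read, so every printed line overwrites the corresponding original.
        with fileinput.input(files=files, inplace=True) as handle:
            for line in handle:
                print(compiled.sub(replacement, line), end="")

    with tempfile.TemporaryDirectory() as tmp:
        # Hypothetical stand-in for mkl.cmake
        cmake_file = Path(tmp) / "mkl.cmake"
        cmake_file.write_text("find_package(MKL QUIET)\nmessage(STATUS done)\n")
        modify_source_files(
            str(cmake_file), r"find_package\(MKL QUIET\)", "# find_package(MKL QUIET)"
        )
        # Only the matching line is rewritten; the rest of the file is untouched
        assert cmake_file.read_text().startswith("# find_package(MKL QUIET)")

Commenting out `find_package(MKL QUIET)` leaves MKL undetected, so the Caffe2
CMake configuration proceeds as if MKL were absent and never emits the broken
link flags described above.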