From 1d60e5363eff02d0b1a70c6c64b6da5c6bb59009 Mon Sep 17 00:00:00 2001 From: Evan Li Date: Wed, 10 Jul 2024 14:48:22 -0700 Subject: [PATCH 01/14] feat: engine caching revert backend changes update dynamo path add save_engine_cache and load_engine_cache args support customizing engine cache class refactor and add LRU to clear cache fix bug --- examples/dynamo/engine_caching_example.py | 174 ++++++++++++++ py/torch_tensorrt/dynamo/_compiler.py | 39 ++++ py/torch_tensorrt/dynamo/_defaults.py | 12 +- py/torch_tensorrt/dynamo/_engine_caching.py | 212 ++++++++++++++++++ py/torch_tensorrt/dynamo/_settings.py | 16 ++ py/torch_tensorrt/dynamo/backend/backends.py | 2 +- .../dynamo/conversion/_TRTInterpreter.py | 40 ++++ 7 files changed, 493 insertions(+), 2 deletions(-) create mode 100644 examples/dynamo/engine_caching_example.py create mode 100644 py/torch_tensorrt/dynamo/_engine_caching.py diff --git a/examples/dynamo/engine_caching_example.py b/examples/dynamo/engine_caching_example.py new file mode 100644 index 0000000000..a7b8f02f7a --- /dev/null +++ b/examples/dynamo/engine_caching_example.py @@ -0,0 +1,174 @@ +import ast +import logging +import os +from typing import List, Optional, Tuple + +import numpy as np +import torch +import torch_tensorrt as torch_trt +import torchvision.models as models +from torch_tensorrt.dynamo._defaults import TIMING_CACHE_PATH +from torch_tensorrt.dynamo._engine_caching import BaseEngineCache + +_LOGGER: logging.Logger = logging.getLogger(__name__) + + +np.random.seed(0) +torch.manual_seed(0) +size = (100, 3, 224, 224) + +model = models.resnet18(pretrained=True).eval().to("cuda") +enabled_precisions = {torch.float} +debug = False +min_block_size = 1 +use_python_runtime = False + + +def remove_timing_cache(path=TIMING_CACHE_PATH): + if os.path.exists(path): + os.remove(path) + + +def dynamo_path(iterations=3): + times = [] + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),) + # Mark the dim0 of inputs as dynamic + batch = torch.export.Dim("batch", min=1, max=200) + exp_program = torch.export.export( + model, args=example_inputs, dynamic_shapes={"x": {0: batch}} + ) + + for i in range(iterations): + inputs = [torch.rand((100 + i, 3, 224, 224)).to("cuda")] + remove_timing_cache() # remove timing cache for engine caching messurement + if i == 0: + save_engine_cache = False + load_engine_cache = False + else: + save_engine_cache = True + load_engine_cache = True + + start.record() + trt_gm = torch_trt.dynamo.compile( + exp_program, + tuple(inputs), + use_python_runtime=use_python_runtime, + enabled_precisions=enabled_precisions, + debug=debug, + min_block_size=min_block_size, + make_refitable=True, + save_engine_cache=save_engine_cache, + load_engine_cache=load_engine_cache, + engine_cache_size=1 << 30, # 1GB + ) + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) + + print("-----dynamo_path-----> compilation time:", times, "milliseconds") + + +# Custom Engine Cache +class MyEngineCache(BaseEngineCache): + + def __init__( + self, + engine_cache_size: int, + engine_cache_dir: str, + ) -> None: + self.total_engine_cache_size = engine_cache_size + self.available_engine_cache_size = engine_cache_size + self.engine_cache_dir = engine_cache_dir + + def save( + self, + hash: str, + serialized_engine: bytes, + input_names: List[str], + output_names: List[str], + ) -> bool: + path = os.path.join( + self.engine_cache_dir, + 
f"{hash}/engine--{input_names}--{output_names}.trt", + ) + try: + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "wb") as f: + f.write(serialized_engine) + except Exception as e: + _LOGGER.warning(f"Failed to save the TRT engine to {path}: {e}") + return False + + _LOGGER.info(f"A TRT engine was cached to {path}") + serialized_engine_size = int(serialized_engine.nbytes) + self.available_engine_cache_size -= serialized_engine_size + return True + + def load(self, hash: str) -> Tuple[Optional[bytes], List[str], List[str]]: + directory = os.path.join(self.engine_cache_dir, hash) + if os.path.exists(directory): + engine_list = os.listdir(directory) + assert ( + len(engine_list) == 1 + ), f"There are more than one engine {engine_list} under {directory}." + path = os.path.join(directory, engine_list[0]) + input_names_str, output_names_str = ( + engine_list[0].split(".trt")[0].split("--")[1:] + ) + input_names = ast.literal_eval(input_names_str) + output_names = ast.literal_eval(output_names_str) + with open(path, "rb") as f: + serialized_engine = f.read() + return serialized_engine, input_names, output_names + else: + return None, [], [] + + +def compile_path(iterations=3): + times = [] + engine_cache = MyEngineCache(200 * (1 << 20), "/tmp/your_dir") + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + for i in range(iterations): + inputs = [torch.rand(size).to("cuda")] + # remove timing cache and reset dynamo for engine caching messurement + remove_timing_cache() + torch._dynamo.reset() + + if i == 0: + save_engine_cache = False + load_engine_cache = False + else: + save_engine_cache = True + load_engine_cache = True + + start.record() + compiled_model = torch.compile( + model, + backend="tensorrt", + options={ + "use_python_runtime": use_python_runtime, + "enabled_precisions": enabled_precisions, + "debug": debug, + "min_block_size": min_block_size, + "make_refitable": True, + "save_engine_cache": save_engine_cache, + "load_engine_cache": load_engine_cache, + "engine_cache_instance": engine_cache, # use custom engine cache + }, + ) + compiled_model(*inputs) # trigger the compilation + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) + + print("-----compile_path-----> compilation time:", times, "milliseconds") + + +if __name__ == "__main__": + dynamo_path() + compile_path() diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index a4849f257e..229ecb5ef7 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -18,6 +18,7 @@ dryrun_stats_display, parse_non_trt_nodes, ) +from torch_tensorrt.dynamo._engine_caching import BaseEngineCache, EngineCache from torch_tensorrt.dynamo.conversion import ( CompilationSettings, UnsupportedOperatorException, @@ -82,6 +83,11 @@ def compile( hardware_compatible: bool = _defaults.HARDWARE_COMPATIBLE, timing_cache_path: str = _defaults.TIMING_CACHE_PATH, lazy_engine_init: bool = _defaults.LAZY_ENGINE_INIT, + save_engine_cache: bool = _defaults.SAVE_ENGINE_CACHE, + load_engine_cache: bool = _defaults.LOAD_ENGINE_CACHE, + engine_cache_dir: str = _defaults.ENGINE_CACHE_DIR, + engine_cache_size: int = _defaults.ENGINE_CACHE_SIZE, + engine_cache_instance: Optional[BaseEngineCache] = None, **kwargs: Any, ) -> torch.fx.GraphModule: """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT @@ -147,6 +153,11 @@ def compile( hardware_compatible (bool): Build the TensorRT engines compatible 
with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer) timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation lazy_engine_init (bool): Defer setting up engines until the compilation of all engines is complete. Can allow larger models with multiple graph breaks to compile but can lead to oversubscription of GPU memory at runtime. + save_engine_cache (bool): Whether to save the compiled TRT engines to hard disk + load_engine_cache (bool): Whether to load the compiled TRT engines from hard disk + engine_cache_dir (str): Directory to store the cached TRT engines + engine_cache_size (int): Maximum hard-disk space to use for the engine cache + engine_cache_instance (Optional[BaseEngineCache]): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache **kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT @@ -224,6 +235,11 @@ def compile( gm = post_lowering(gm) logger.debug("Lowered Input graph: " + str(gm.graph)) + if engine_cache_instance is None: + engine_cache_instance = EngineCacheInstanceCreator.get_creator( + engine_cache_size, engine_cache_dir + ).engine_cache_instance + compilation_options = { "enabled_precisions": ( enabled_precisions if enabled_precisions else _defaults.ENABLED_PRECISIONS @@ -257,6 +273,11 @@ def compile( "hardware_compatible": hardware_compatible, "timing_cache_path": timing_cache_path, "lazy_engine_init": lazy_engine_init, + "save_engine_cache": save_engine_cache, + "load_engine_cache": load_engine_cache, + "engine_cache_dir": engine_cache_dir, + "engine_cache_size": engine_cache_size, + "engine_cache_instance": engine_cache_instance, } settings = CompilationSettings(**compilation_options) @@ -665,3 +686,21 @@ def convert_exported_program_to_serialized_trt_engine( serialized_engine: bytes = interpreter_result.serialized_engine return serialized_engine + + +class EngineCacheInstanceCreator: + engine_cache_creator = None + + def __init__(self, engine_cache_size: int, engine_cache_dir: str) -> None: + self.engine_cache_instance = EngineCache( + engine_cache_size=engine_cache_size, + engine_cache_dir=engine_cache_dir, + ) + + @classmethod + def get_creator( + cls, engine_cache_size: int, engine_cache_dir: str + ) -> EngineCacheInstanceCreator: + if cls.engine_cache_creator is None: + cls.engine_cache_creator = cls(engine_cache_size, engine_cache_dir) + return cls.engine_cache_creator diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py index 2696e26936..e90f3f8c2a 100644 --- a/py/torch_tensorrt/dynamo/_defaults.py +++ b/py/torch_tensorrt/dynamo/_defaults.py @@ -4,6 +4,7 @@ import torch from torch_tensorrt._Device import Device from torch_tensorrt._enums import EngineCapability, dtype +from torch_tensorrt.dynamo._engine_caching import EngineCache ENABLED_PRECISIONS = {dtype.f32} DEBUG = False @@ -31,8 +32,17 @@ DRYRUN = False HARDWARE_COMPATIBLE = False SUPPORTED_KERNEL_PRECISIONS = {dtype.f32, dtype.f16, dtype.bf16, dtype.i8, dtype.f8} -TIMING_CACHE_PATH = os.path.join(tempfile.gettempdir(), "timing_cache.bin") +TIMING_CACHE_PATH = os.path.join( + tempfile.gettempdir(), "torch_tensorrt_engine_cache", "timing_cache.bin" +) LAZY_ENGINE_INIT = False +SAVE_ENGINE_CACHE = True +LOAD_ENGINE_CACHE = True +ENGINE_CACHE_DIR = os.path.join(tempfile.gettempdir(), "torch_tensorrt_engine_cache") 
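+# Default disk budget for cached TRT engines: 1073741824 bytes = 1 << 30, i.e. 1 GiB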
+ENGINE_CACHE_SIZE = 1073741824 +ENGINE_CACHE_INSTANCE = EngineCache( + engine_cache_size=ENGINE_CACHE_SIZE, engine_cache_dir=ENGINE_CACHE_DIR +) def default_device() -> Device: diff --git a/py/torch_tensorrt/dynamo/_engine_caching.py b/py/torch_tensorrt/dynamo/_engine_caching.py new file mode 100644 index 0000000000..f491bc2523 --- /dev/null +++ b/py/torch_tensorrt/dynamo/_engine_caching.py @@ -0,0 +1,212 @@ +import ast +import copy +import logging +import os +import shutil +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional, Tuple, cast + +import torch +from torch._inductor.codecache import FxGraphCachePickler +from torch.fx.experimental.proxy_tensor import maybe_disable_fake_tensor_mode + +_LOGGER: logging.Logger = logging.getLogger(__name__) + + +class BaseEngineCache(ABC): + + @abstractmethod + def __init__( + self, + *args: Any, + **kwargs: Any, + ) -> None: + pass + + @staticmethod + def get_hash(gm: torch.fx.GraphModule) -> str: + """Get the hash value of the GraphModule + + Args: + gm (torch.fx.GraphModule): GraphModule to hash + + Returns: + str: hash value of the GraphModule + """ + # parameters are set to 0 + with maybe_disable_fake_tensor_mode(): + new_gm = copy.deepcopy(gm) + for name, param in new_gm.named_parameters(): + param.data.zero_() + + hash_val = cast(str, FxGraphCachePickler.get_hash(new_gm)) + + return hash_val + + @abstractmethod + def save( + self, + hash: str, + serialized_engine: bytes, + input_names: List[str], + output_names: List[str], + ) -> bool: + """Save the serialized engine to hard disk + + Args: + hash (str): hash value of the GraphModule + serialized_engine (bytes): serialized TRT engine + input_names (List[str]): input names of TRT engine + output_names (List[str]): output names of TRT engine + + Returns: + bool: whether the serialized engine is saved successfully + """ + pass + + @abstractmethod + def load(self, hash: str) -> Tuple[Optional[bytes], List[str], List[str]]: + """Load the serialized engine from hard disk + + Args: + hash (str): hash value of the GraphModule + + Returns: + Sequence[Optional[bytes], List[str], List[str]]: serialized TRT engine, input names of TRT Engine, output names of TRT Engine + """ + pass + + +class EngineCache(BaseEngineCache): + + def __init__( + self, + engine_cache_size: int, + engine_cache_dir: str, + ) -> None: + self.total_engine_cache_size = engine_cache_size + self.available_engine_cache_size = engine_cache_size + self.engine_cache_dir = engine_cache_dir + self.hash2size_map: Dict[str, int] = {} + + def has_available_cache_size(self, serialized_engine: bytes) -> bool: + """Check if the cache has available space for saving the serialized engine + + Args: + serialized_engine (bytes): serialized TRT engine + + Returns: + bool: whether the cache has available size for the serialized engine + """ + return int(serialized_engine.nbytes) <= self.available_engine_cache_size + + def clear_cache(self, needed_min_size: int) -> bool: + """Clear the cache to make sure at least `needed_min_size` bytes are available, if possible + + Args: + needed_min_size (int): the minimum needed size + + Returns: + bool: whether the cache is cleared successfully + """ + + def LRU() -> bool: + """Clear the Least Recently Used engine in the cache""" + # Get the list of engine directories + engines_hash_values = os.listdir(self.engine_cache_dir) + # Sort the engine directories by modification time (oldest first) + engines_hash_values.sort( + key=lambda x: os.path.getmtime(os.path.join(self.engine_cache_dir, 
x)) + ) + # Iterate over the engine directories and remove the oldest ones until enough space is available + for engine_hash in engines_hash_values: + if self.available_engine_cache_size >= needed_min_size: + break + engine_path = os.path.join(self.engine_cache_dir, engine_hash) + try: + # Remove the entire directory + shutil.rmtree(engine_path) + # Update the available cache size + self.available_engine_cache_size += self.hash2size_map.pop( + engine_hash, 0 + ) + _LOGGER.info( + f"Removed the engine cache at {engine_path}, available cache size: {self.available_engine_cache_size} bytes." + ) + except Exception as e: + _LOGGER.warning( + f"Failed to clear the engine cache at {engine_path}: {e}" + ) + return False + return True + + if not os.path.exists(self.engine_cache_dir): + return False + + _LOGGER.info( + f"Total cache size: {self.total_engine_cache_size} bytes; available cache size: {self.available_engine_cache_size} bytes. Clearing the cache to make sure at least {needed_min_size} bytes are available." + ) + return LRU() + + def save( + self, + hash: str, + serialized_engine: bytes, + input_names: List[str], + output_names: List[str], + ) -> bool: + serialized_engine_size = int(serialized_engine.nbytes) + if serialized_engine_size > self.total_engine_cache_size: + _LOGGER.warning( + f"The serialized engine cannot be saved because the size of the engine {serialized_engine_size} is larger than the total cache size {self.total_engine_cache_size}." + ) + return False + + # Check if there is enough available cache size for the serialized engine + if not self.has_available_cache_size(serialized_engine): + self.clear_cache(serialized_engine_size) + + # Save the serialized engine to the cache directory + if self.has_available_cache_size(serialized_engine): + path = os.path.join( + self.engine_cache_dir, + f"{hash}/engine--{input_names}--{output_names}.trt", + ) + try: + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "wb") as f: + f.write(serialized_engine) + self.hash2size_map[hash] = serialized_engine_size + self.available_engine_cache_size -= serialized_engine_size + _LOGGER.info(f"A TRT engine was cached to {path}") + + except Exception as e: + _LOGGER.warning(f"Failed to save the TRT engine to {path}: {e}") + return False + + return True + + else: + _LOGGER.warning( + f"The serialized engine {serialized_engine_size} is still larger than the available cache size {self.available_engine_cache_size}." + ) + return False + + def load(self, hash: str) -> Tuple[Optional[bytes], List[str], List[str]]: + directory = os.path.join(self.engine_cache_dir, hash) + if os.path.exists(directory): + engine_list = os.listdir(directory) + assert ( + len(engine_list) == 1 + ), f"There are more than one engine {engine_list} under {directory}." 
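+            # save() wrote the file as {hash}/engine--{input_names}--{output_names}.trt,
+            # so the input/output name lists can be recovered from the file name itself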
+ path = os.path.join(directory, engine_list[0]) + input_names_str, output_names_str = ( + engine_list[0].split(".trt")[0].split("--")[1:] + ) + input_names = ast.literal_eval(input_names_str) + output_names = ast.literal_eval(output_names_str) + with open(path, "rb") as f: + serialized_engine = f.read() + return serialized_engine, input_names, output_names + else: + return None, [], [] diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py index 4a9792d3dc..90c17d03c3 100644 --- a/py/torch_tensorrt/dynamo/_settings.py +++ b/py/torch_tensorrt/dynamo/_settings.py @@ -14,9 +14,13 @@ DRYRUN, ENABLE_EXPERIMENTAL_DECOMPOSITIONS, ENABLED_PRECISIONS, + ENGINE_CACHE_DIR, + ENGINE_CACHE_INSTANCE, + ENGINE_CACHE_SIZE, ENGINE_CAPABILITY, HARDWARE_COMPATIBLE, LAZY_ENGINE_INIT, + LOAD_ENGINE_CACHE, MAKE_REFITABLE, MAX_AUX_STREAMS, MIN_BLOCK_SIZE, @@ -24,6 +28,7 @@ OPTIMIZATION_LEVEL, PASS_THROUGH_BUILD_FAILURES, REQUIRE_FULL_COMPILATION, + SAVE_ENGINE_CACHE, SPARSE_WEIGHTS, TIMING_CACHE_PATH, TRUNCATE_DOUBLE, @@ -33,6 +38,7 @@ WORKSPACE_SIZE, default_device, ) +from torch_tensorrt.dynamo._engine_caching import BaseEngineCache @dataclass @@ -74,6 +80,11 @@ class CompilationSettings: output to a file if a string path is specified hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer) timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation + save_engine_cache (bool): Whether to save the compiled TRT engines to hard disk + load_engine_cache (bool): Whether to load the compiled TRT engines from hard disk + engine_cache_dir (str): Directory to store the cached TRT engines + engine_cache_size (int): Maximum hard-disk space to use for the engine cache + engine_cache_instance (BaseEngineCache): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache """ enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS) @@ -106,3 +117,8 @@ class CompilationSettings: hardware_compatible: bool = HARDWARE_COMPATIBLE timing_cache_path: str = TIMING_CACHE_PATH lazy_engine_init: bool = LAZY_ENGINE_INIT + save_engine_cache: bool = SAVE_ENGINE_CACHE + load_engine_cache: bool = LOAD_ENGINE_CACHE + engine_cache_dir: str = ENGINE_CACHE_DIR + engine_cache_size: int = ENGINE_CACHE_SIZE + engine_cache_instance: BaseEngineCache = ENGINE_CACHE_INSTANCE diff --git a/py/torch_tensorrt/dynamo/backend/backends.py b/py/torch_tensorrt/dynamo/backend/backends.py index ae3cb38f2d..e34f37a9b8 100644 --- a/py/torch_tensorrt/dynamo/backend/backends.py +++ b/py/torch_tensorrt/dynamo/backend/backends.py @@ -98,7 +98,7 @@ def _pretraced_backend( logger.debug("Post-AOT Autograd graph:\n" + str(gm.graph)) - gm = post_lowering(gm) + gm = post_lowering(gm, sample_inputs) logger.debug("Lowered Input graph:\n " + str(gm.graph)) diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 9fef61961b..ccb602e8dc 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -323,6 +323,7 @@ def _save_timing_cache( This is called after a TensorRT engine is built. 
Save the timing cache """ timing_cache = builder_config.get_timing_cache() + os.makedirs(os.path.dirname(timing_cache_path), exist_ok=True) with open(timing_cache_path, "wb") as timing_cache_file: timing_cache_file.write(memoryview(timing_cache.serialize())) @@ -516,15 +517,50 @@ def run( Args: strict_type_constraints: Usually we should set it to False unless we want to control the precision of certain layer for numeric reasons. algorithm_selector: set up algorithm selection for certain layer + tactic_sources: set up tactic sources for certain layer Return: TRTInterpreterResult """ + if ( + self.compilation_settings.save_engine_cache + or self.compilation_settings.load_engine_cache + ): + engine_cache = self.compilation_settings.engine_cache_instance + hash_val = engine_cache.get_hash(self.module) + + if self.compilation_settings.load_engine_cache: + # query the cached TRT engine + serialized_engine, input_names, output_names = engine_cache.load(hash_val) + if serialized_engine is not None: + self._input_names = input_names + self._output_names = output_names + _LOGGER.info( + "Hit the cached TRT engine. It is loaded for skipping recompilation." + ) + + # refit the engine + from torch_tensorrt.dynamo._refit import ( + _refit_single_trt_engine_with_gm, + ) + + runtime = trt.Runtime(TRT_LOGGER) + engine = runtime.deserialize_cuda_engine(serialized_engine) + _refit_single_trt_engine_with_gm( + self.module, engine, self.input_specs, self.compilation_settings + ) + _LOGGER.info("Refitting Succeed!") + + return TRTInterpreterResult( + serialized_engine, self._input_names, self._output_names + ) + self._construct_trt_network_def() if self.compilation_settings.make_refitable: self._save_weight_mapping() build_engine_start_time = datetime.now() + _LOGGER.info("Not found cached TRT engines. 
Start building engine.") builder_config = self._populate_trt_builder_config( strict_type_constraints, algorithm_selector, tactic_sources @@ -547,6 +583,10 @@ def run( self._save_timing_cache( builder_config, self.compilation_settings.timing_cache_path ) + if self.compilation_settings.save_engine_cache: + engine_cache.save( + hash_val, serialized_engine, self._input_names, self._output_names + ) with io.BytesIO() as engine_bytes: engine_bytes.write(serialized_engine) From f3d10848e29b4b185edaec61a6ff562f21a18f20 Mon Sep 17 00:00:00 2001 From: Evan Li Date: Wed, 7 Aug 2024 11:45:12 -0700 Subject: [PATCH 02/14] rebase --- py/torch_tensorrt/dynamo/backend/backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py/torch_tensorrt/dynamo/backend/backends.py b/py/torch_tensorrt/dynamo/backend/backends.py index e34f37a9b8..ae3cb38f2d 100644 --- a/py/torch_tensorrt/dynamo/backend/backends.py +++ b/py/torch_tensorrt/dynamo/backend/backends.py @@ -98,7 +98,7 @@ def _pretraced_backend( logger.debug("Post-AOT Autograd graph:\n" + str(gm.graph)) - gm = post_lowering(gm, sample_inputs) + gm = post_lowering(gm) logger.debug("Lowered Input graph:\n " + str(gm.graph)) From 1e5b501709a852e18eb0b114439d966576cd26a6 Mon Sep 17 00:00:00 2001 From: Evan Li Date: Thu, 8 Aug 2024 18:25:45 -0700 Subject: [PATCH 03/14] add comments --- examples/dynamo/engine_caching_example.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/dynamo/engine_caching_example.py b/examples/dynamo/engine_caching_example.py index a7b8f02f7a..1bfbb4dd44 100644 --- a/examples/dynamo/engine_caching_example.py +++ b/examples/dynamo/engine_caching_example.py @@ -41,6 +41,10 @@ def dynamo_path(iterations=3): model, args=example_inputs, dynamic_shapes={"x": {0: batch}} ) + # The 1st iteration is to measure the compilation time without engine caching + # The 2nd and 3rd iterations are to measure the compilation time with engine caching. + # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. + # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. for i in range(iterations): inputs = [torch.rand((100 + i, 3, 224, 224)).to("cuda")] remove_timing_cache() # remove timing cache for engine caching messurement @@ -133,6 +137,10 @@ def compile_path(iterations=3): start = torch.cuda.Event(enable_timing=True) end = torch.cuda.Event(enable_timing=True) + # The 1st iteration is to measure the compilation time without engine caching + # The 2nd and 3rd iterations are to measure the compilation time with engine caching. + # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. + # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. 
for i in range(iterations): inputs = [torch.rand(size).to("cuda")] # remove timing cache and reset dynamo for engine caching messurement From bc0a8c055953a8b7a04092491dbc81e10cd3b9ad Mon Sep 17 00:00:00 2001 From: Evan Li Date: Mon, 12 Aug 2024 14:08:55 -0700 Subject: [PATCH 04/14] add bert example --- .../dynamo/engine_caching_bert_example.py | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 examples/dynamo/engine_caching_bert_example.py diff --git a/examples/dynamo/engine_caching_bert_example.py b/examples/dynamo/engine_caching_bert_example.py new file mode 100644 index 0000000000..f4635f5f5d --- /dev/null +++ b/examples/dynamo/engine_caching_bert_example.py @@ -0,0 +1,64 @@ +import numpy as np +import torch +import torch_tensorrt +from engine_caching_example import remove_timing_cache +from transformers import BertModel + +np.random.seed(0) +torch.manual_seed(0) + +model = BertModel.from_pretrained("bert-base-uncased", return_dict=False).cuda().eval() +inputs = [ + torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"), + torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"), +] + + +def compile_bert(iterations=3): + times = [] + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + # The 1st iteration is to measure the compilation time without engine caching + # The 2nd and 3rd iterations are to measure the compilation time with engine caching. + # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. + # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. + for i in range(iterations): + # remove timing cache and reset dynamo for engine caching messurement + remove_timing_cache() + torch._dynamo.reset() + + if i == 0: + save_engine_cache = False + load_engine_cache = False + else: + save_engine_cache = True + load_engine_cache = True + + start.record() + compilation_kwargs = { + "use_python_runtime": False, + "enabled_precisions": {torch.float}, + "truncate_double": True, + "debug": True, + "min_block_size": 1, + "make_refitable": True, + "save_engine_cache": save_engine_cache, + "load_engine_cache": load_engine_cache, + "engine_cache_size": 1 << 30, # 1GB + } + optimized_model = torch.compile( + model, + backend="torch_tensorrt", + options=compilation_kwargs, + ) + optimized_model(*inputs) + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) + + print("-----compile bert-----> compilation time:", times, "milliseconds") + + +if __name__ == "__main__": + compile_bert() From 79544e39d9b1e5ffbcd747a046fc17c018a1ed5e Mon Sep 17 00:00:00 2001 From: Evan Li Date: Wed, 14 Aug 2024 02:43:14 -0700 Subject: [PATCH 05/14] support saving weight name map --- .../dynamo/engine_caching_bert_example.py | 4 +- examples/dynamo/engine_caching_example.py | 5 +- py/torch_tensorrt/dynamo/_engine_caching.py | 124 +++++++++++++----- .../dynamo/conversion/_TRTInterpreter.py | 29 ++-- 4 files changed, 107 insertions(+), 55 deletions(-) diff --git a/examples/dynamo/engine_caching_bert_example.py b/examples/dynamo/engine_caching_bert_example.py index f4635f5f5d..2f133f5e8f 100644 --- a/examples/dynamo/engine_caching_bert_example.py +++ b/examples/dynamo/engine_caching_bert_example.py @@ -40,7 +40,7 @@ def compile_bert(iterations=3): "use_python_runtime": False, "enabled_precisions": {torch.float}, "truncate_double": True, - "debug": True, + "debug": False, "min_block_size": 1, "make_refitable": True, "save_engine_cache": 
save_engine_cache, @@ -57,7 +57,7 @@ def compile_bert(iterations=3): torch.cuda.synchronize() times.append(start.elapsed_time(end)) - print("-----compile bert-----> compilation time:", times, "milliseconds") + print("-----compile bert-----> compilation time:\n", times, "milliseconds") if __name__ == "__main__": diff --git a/examples/dynamo/engine_caching_example.py b/examples/dynamo/engine_caching_example.py index 1bfbb4dd44..80cf696466 100644 --- a/examples/dynamo/engine_caching_example.py +++ b/examples/dynamo/engine_caching_example.py @@ -72,12 +72,11 @@ def dynamo_path(iterations=3): torch.cuda.synchronize() times.append(start.elapsed_time(end)) - print("-----dynamo_path-----> compilation time:", times, "milliseconds") + print("-----dynamo_path-----> compilation time:\n", times, "milliseconds") # Custom Engine Cache class MyEngineCache(BaseEngineCache): - def __init__( self, engine_cache_size: int, @@ -174,7 +173,7 @@ def compile_path(iterations=3): torch.cuda.synchronize() times.append(start.elapsed_time(end)) - print("-----compile_path-----> compilation time:", times, "milliseconds") + print("-----compile_path-----> compilation time:\n", times, "milliseconds") if __name__ == "__main__": diff --git a/py/torch_tensorrt/dynamo/_engine_caching.py b/py/torch_tensorrt/dynamo/_engine_caching.py index f491bc2523..f9b6f075eb 100644 --- a/py/torch_tensorrt/dynamo/_engine_caching.py +++ b/py/torch_tensorrt/dynamo/_engine_caching.py @@ -1,8 +1,9 @@ -import ast import copy import logging import os +import pickle import shutil +import sys from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Tuple, cast @@ -50,6 +51,7 @@ def save( serialized_engine: bytes, input_names: List[str], output_names: List[str], + weight_name_map: Optional[Dict[str, Any]] = None, ) -> bool: """Save the serialized engine to hard disk @@ -58,6 +60,7 @@ def save( serialized_engine (bytes): serialized TRT engine input_names (List[str]): input names of TRT engine output_names (List[str]): output names of TRT engine + weight_name_map (Optional[Dict[str, Any]]): weight name map for refitting Returns: bool: whether the serialized engine is saved successfully @@ -65,14 +68,16 @@ def save( pass @abstractmethod - def load(self, hash: str) -> Tuple[Optional[bytes], List[str], List[str]]: + def load( + self, hash: str + ) -> Tuple[Optional[bytes], List[str], List[str], Optional[Dict[str, Any]]]: """Load the serialized engine from hard disk Args: hash (str): hash value of the GraphModule Returns: - Sequence[Optional[bytes], List[str], List[str]]: serialized TRT engine, input names of TRT Engine, output names of TRT Engine + Sequence[Optional[bytes], List[str], List[str], Optional[Dict[str, Any]]]: serialized engine, input names, output names, weight name map """ pass @@ -89,16 +94,16 @@ def __init__( self.engine_cache_dir = engine_cache_dir self.hash2size_map: Dict[str, int] = {} - def has_available_cache_size(self, serialized_engine: bytes) -> bool: + def has_available_cache_size(self, needed_size: int) -> bool: """Check if the cache has available space for saving the serialized engine Args: - serialized_engine (bytes): serialized TRT engine + needed_size (int): needed size for erialized TRT engine and/or weight_name_map Returns: bool: whether the cache has available size for the serialized engine """ - return int(serialized_engine.nbytes) <= self.available_engine_cache_size + return needed_size <= self.available_engine_cache_size def clear_cache(self, needed_min_size: int) -> bool: """Clear the cache to 
make sure at least `needed_min_size` bytes are available, if possible @@ -154,36 +159,75 @@ def save( serialized_engine: bytes, input_names: List[str], output_names: List[str], + weight_name_map: Optional[Dict[str, Any]] = None, ) -> bool: serialized_engine_size = int(serialized_engine.nbytes) + if weight_name_map is not None: + serialized_engine_size += sum( + sys.getsizeof(v) for v in weight_name_map.values() + ) if serialized_engine_size > self.total_engine_cache_size: _LOGGER.warning( f"The serialized engine cannot be saved because the size of the engine {serialized_engine_size} is larger than the total cache size {self.total_engine_cache_size}." ) return False - # Check if there is enough available cache size for the serialized engine - if not self.has_available_cache_size(serialized_engine): + # Check if there is enough available cache size for the serialized engine and/or weight_name_map + if not self.has_available_cache_size(serialized_engine_size): self.clear_cache(serialized_engine_size) # Save the serialized engine to the cache directory - if self.has_available_cache_size(serialized_engine): - path = os.path.join( - self.engine_cache_dir, - f"{hash}/engine--{input_names}--{output_names}.trt", + if self.has_available_cache_size(serialized_engine_size): + self.hash2size_map[hash] = serialized_engine_size + self.available_engine_cache_size -= serialized_engine_size + directory = os.path.join(self.engine_cache_dir, hash) + + engine_path = os.path.join( + directory, + "engine.trt", + ) + io_names_path = os.path.join( + directory, + "io_names.pkl", ) try: - os.makedirs(os.path.dirname(path), exist_ok=True) - with open(path, "wb") as f: + os.makedirs(os.path.dirname(engine_path), exist_ok=True) + with open(engine_path, "wb") as f: f.write(serialized_engine) - self.hash2size_map[hash] = serialized_engine_size - self.available_engine_cache_size -= serialized_engine_size - _LOGGER.info(f"A TRT engine was cached to {path}") - + os.makedirs(os.path.dirname(io_names_path), exist_ok=True) + with open(io_names_path, "wb") as f: + pickle.dump( + {"input_names": input_names, "output_names": output_names}, f + ) + _LOGGER.info(f"The TRT engine was saved to {engine_path}") except Exception as e: - _LOGGER.warning(f"Failed to save the TRT engine to {path}: {e}") + del self.hash2size_map[hash] + self.available_engine_cache_size += serialized_engine_size + shutil.rmtree(directory) + _LOGGER.warning(f"Failed to save the TRT engine to {engine_path}: {e}") return False + if weight_name_map is not None: + weight_name_map_path = os.path.join( + directory, + "weight_name_map.pkl", + ) + try: + os.makedirs(os.path.dirname(weight_name_map_path), exist_ok=True) + with open(weight_name_map_path, "wb") as f: + pickle.dump(weight_name_map, f) + _LOGGER.info( + f"The weight_name_map was saved to {weight_name_map_path}" + ) + except Exception as e: + del self.hash2size_map[hash] + self.available_engine_cache_size += serialized_engine_size + shutil.rmtree(directory) + _LOGGER.warning( + f"Failed to save the weight_name_map to {weight_name_map_path}: {e}" + ) + return False + return True else: @@ -192,21 +236,33 @@ def save( ) return False - def load(self, hash: str) -> Tuple[Optional[bytes], List[str], List[str]]: + def load( + self, hash: str + ) -> Tuple[Optional[bytes], List[str], List[str], Optional[Dict[str, Any]]]: directory = os.path.join(self.engine_cache_dir, hash) if os.path.exists(directory): - engine_list = os.listdir(directory) - assert ( - len(engine_list) == 1 - ), f"There are more than one engine 
{engine_list} under {directory}." - path = os.path.join(directory, engine_list[0]) - input_names_str, output_names_str = ( - engine_list[0].split(".trt")[0].split("--")[1:] - ) - input_names = ast.literal_eval(input_names_str) - output_names = ast.literal_eval(output_names_str) - with open(path, "rb") as f: - serialized_engine = f.read() - return serialized_engine, input_names, output_names + # load engine + serialized_engine = None + engine_path = os.path.join(directory, "engine.trt") + if os.path.exists(engine_path): + with open(engine_path, "rb") as f: + serialized_engine = f.read() + + input_names = [] + output_names = [] + io_names_path = os.path.join(directory, "io_names.pkl") + if os.path.exists(io_names_path): + with open(io_names_path, "rb") as f: + io_names = pickle.load(f) + input_names = io_names["input_names"] + output_names = io_names["output_names"] + + # load weight_name_map + weight_name_map = None + weight_name_map_path = os.path.join(directory, "weight_name_map.pkl") + if os.path.exists(weight_name_map_path): + with open(weight_name_map_path, "rb") as f: + weight_name_map = pickle.load(f) + return serialized_engine, input_names, output_names, weight_name_map else: - return None, [], [] + return None, [], [], {} diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index ccb602e8dc..cffcea5b9f 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -530,28 +530,21 @@ def run( if self.compilation_settings.load_engine_cache: # query the cached TRT engine - serialized_engine, input_names, output_names = engine_cache.load(hash_val) + serialized_engine, input_names, output_names, weight_name_map = ( + engine_cache.load(hash_val) + ) if serialized_engine is not None: self._input_names = input_names self._output_names = output_names + self.weight_name_map = weight_name_map _LOGGER.info( "Hit the cached TRT engine. It is loaded for skipping recompilation." 
) - - # refit the engine - from torch_tensorrt.dynamo._refit import ( - _refit_single_trt_engine_with_gm, - ) - - runtime = trt.Runtime(TRT_LOGGER) - engine = runtime.deserialize_cuda_engine(serialized_engine) - _refit_single_trt_engine_with_gm( - self.module, engine, self.input_specs, self.compilation_settings - ) - _LOGGER.info("Refitting Succeed!") - return TRTInterpreterResult( - serialized_engine, self._input_names, self._output_names + serialized_engine, + self._input_names, + self._output_names, + self.weight_name_map, ) self._construct_trt_network_def() @@ -585,7 +578,11 @@ def run( ) if self.compilation_settings.save_engine_cache: engine_cache.save( - hash_val, serialized_engine, self._input_names, self._output_names + hash_val, + serialized_engine, + self._input_names, + self._output_names, + self.weight_name_map, ) with io.BytesIO() as engine_bytes: From a7b7676c3e6ba35c8c10c6800db601b2767285ff Mon Sep 17 00:00:00 2001 From: Evan Li Date: Tue, 20 Aug 2024 22:12:01 -0700 Subject: [PATCH 06/14] refactor --- .../dynamo/engine_caching_bert_example.py | 13 +- examples/dynamo/engine_caching_example.py | 89 +++---- py/torch_tensorrt/dynamo/_compiler.py | 47 +--- py/torch_tensorrt/dynamo/_defaults.py | 9 +- py/torch_tensorrt/dynamo/_engine_caching.py | 245 ++++++++---------- py/torch_tensorrt/dynamo/_settings.py | 24 +- .../dynamo/conversion/_TRTInterpreter.py | 62 +++-- py/torch_tensorrt/dynamo/utils.py | 16 ++ 8 files changed, 227 insertions(+), 278 deletions(-) diff --git a/examples/dynamo/engine_caching_bert_example.py b/examples/dynamo/engine_caching_bert_example.py index 2f133f5e8f..43cfc5f15a 100644 --- a/examples/dynamo/engine_caching_bert_example.py +++ b/examples/dynamo/engine_caching_bert_example.py @@ -29,11 +29,11 @@ def compile_bert(iterations=3): torch._dynamo.reset() if i == 0: - save_engine_cache = False - load_engine_cache = False + cache_built_engines = False + reuse_cached_engines = False else: - save_engine_cache = True - load_engine_cache = True + cache_built_engines = True + reuse_cached_engines = True start.record() compilation_kwargs = { @@ -43,8 +43,9 @@ def compile_bert(iterations=3): "debug": False, "min_block_size": 1, "make_refitable": True, - "save_engine_cache": save_engine_cache, - "load_engine_cache": load_engine_cache, + "cache_built_engines": cache_built_engines, + "reuse_cached_engines": reuse_cached_engines, + "engine_cache_dir": "/tmp/torch_trt_bert_engine_cache", "engine_cache_size": 1 << 30, # 1GB } optimized_model = torch.compile( diff --git a/examples/dynamo/engine_caching_example.py b/examples/dynamo/engine_caching_example.py index 80cf696466..89912e74b0 100644 --- a/examples/dynamo/engine_caching_example.py +++ b/examples/dynamo/engine_caching_example.py @@ -1,7 +1,5 @@ -import ast -import logging import os -from typing import List, Optional, Tuple +from typing import Optional import numpy as np import torch @@ -10,9 +8,6 @@ from torch_tensorrt.dynamo._defaults import TIMING_CACHE_PATH from torch_tensorrt.dynamo._engine_caching import BaseEngineCache -_LOGGER: logging.Logger = logging.getLogger(__name__) - - np.random.seed(0) torch.manual_seed(0) size = (100, 3, 224, 224) @@ -49,11 +44,11 @@ def dynamo_path(iterations=3): inputs = [torch.rand((100 + i, 3, 224, 224)).to("cuda")] remove_timing_cache() # remove timing cache for engine caching messurement if i == 0: - save_engine_cache = False - load_engine_cache = False + cache_built_engines = False + reuse_cached_engines = False else: - save_engine_cache = True - load_engine_cache = True + 
cache_built_engines = True + reuse_cached_engines = True start.record() trt_gm = torch_trt.dynamo.compile( @@ -64,8 +59,8 @@ def dynamo_path(iterations=3): debug=debug, min_block_size=min_block_size, make_refitable=True, - save_engine_cache=save_engine_cache, - load_engine_cache=load_engine_cache, + cache_built_engines=cache_built_engines, + reuse_cached_engines=reuse_cached_engines, engine_cache_size=1 << 30, # 1GB ) end.record() @@ -79,60 +74,36 @@ def dynamo_path(iterations=3): class MyEngineCache(BaseEngineCache): def __init__( self, - engine_cache_size: int, engine_cache_dir: str, ) -> None: - self.total_engine_cache_size = engine_cache_size - self.available_engine_cache_size = engine_cache_size self.engine_cache_dir = engine_cache_dir def save( self, hash: str, - serialized_engine: bytes, - input_names: List[str], - output_names: List[str], - ) -> bool: + blob: bytes, + prefix: str = "blob", + ): path = os.path.join( self.engine_cache_dir, - f"{hash}/engine--{input_names}--{output_names}.trt", + f"{prefix}_{hash}.bin", ) - try: - os.makedirs(os.path.dirname(path), exist_ok=True) - with open(path, "wb") as f: - f.write(serialized_engine) - except Exception as e: - _LOGGER.warning(f"Failed to save the TRT engine to {path}: {e}") - return False - - _LOGGER.info(f"A TRT engine was cached to {path}") - serialized_engine_size = int(serialized_engine.nbytes) - self.available_engine_cache_size -= serialized_engine_size - return True - - def load(self, hash: str) -> Tuple[Optional[bytes], List[str], List[str]]: - directory = os.path.join(self.engine_cache_dir, hash) - if os.path.exists(directory): - engine_list = os.listdir(directory) - assert ( - len(engine_list) == 1 - ), f"There are more than one engine {engine_list} under {directory}." - path = os.path.join(directory, engine_list[0]) - input_names_str, output_names_str = ( - engine_list[0].split(".trt")[0].split("--")[1:] - ) - input_names = ast.literal_eval(input_names_str) - output_names = ast.literal_eval(output_names_str) + os.makedirs(path, exist_ok=True) + with open(path, "wb") as f: + f.write(blob) + + def load(self, hash: str, prefix: str = "blob") -> Optional[bytes]: + path = os.path.join(self.engine_cache_dir, f"{prefix}_{hash}.bin") + if os.path.exists(path): with open(path, "rb") as f: - serialized_engine = f.read() - return serialized_engine, input_names, output_names - else: - return None, [], [] + blob = f.read() + return blob + return None def compile_path(iterations=3): times = [] - engine_cache = MyEngineCache(200 * (1 << 20), "/tmp/your_dir") + engine_cache = MyEngineCache("/tmp/your_dir") start = torch.cuda.Event(enable_timing=True) end = torch.cuda.Event(enable_timing=True) @@ -147,11 +118,11 @@ def compile_path(iterations=3): torch._dynamo.reset() if i == 0: - save_engine_cache = False - load_engine_cache = False + cache_built_engines = False + reuse_cached_engines = False else: - save_engine_cache = True - load_engine_cache = True + cache_built_engines = True + reuse_cached_engines = True start.record() compiled_model = torch.compile( @@ -163,9 +134,9 @@ def compile_path(iterations=3): "debug": debug, "min_block_size": min_block_size, "make_refitable": True, - "save_engine_cache": save_engine_cache, - "load_engine_cache": load_engine_cache, - "engine_cache_instance": engine_cache, # use custom engine cache + "cache_built_engines": cache_built_engines, + "reuse_cached_engines": reuse_cached_engines, + "custom_engine_cache": engine_cache, # use custom engine cache }, ) compiled_model(*inputs) # trigger the compilation 
@@ -178,4 +149,4 @@ def compile_path(iterations=3): if __name__ == "__main__": dynamo_path() - compile_path() + # compile_path() diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 229ecb5ef7..bc31592c06 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -18,7 +18,7 @@ dryrun_stats_display, parse_non_trt_nodes, ) -from torch_tensorrt.dynamo._engine_caching import BaseEngineCache, EngineCache +from torch_tensorrt.dynamo._engine_caching import BaseEngineCache, DiskEngineCache from torch_tensorrt.dynamo.conversion import ( CompilationSettings, UnsupportedOperatorException, @@ -83,11 +83,11 @@ def compile( hardware_compatible: bool = _defaults.HARDWARE_COMPATIBLE, timing_cache_path: str = _defaults.TIMING_CACHE_PATH, lazy_engine_init: bool = _defaults.LAZY_ENGINE_INIT, - save_engine_cache: bool = _defaults.SAVE_ENGINE_CACHE, - load_engine_cache: bool = _defaults.LOAD_ENGINE_CACHE, + cache_built_engines: bool = _defaults.CACHE_BUILT_ENGINES, + reuse_cached_engines: bool = _defaults.REUSE_CACHED_ENGINES, engine_cache_dir: str = _defaults.ENGINE_CACHE_DIR, engine_cache_size: int = _defaults.ENGINE_CACHE_SIZE, - engine_cache_instance: Optional[BaseEngineCache] = None, + custom_engine_cache: Optional[BaseEngineCache] = _defaults.CUSTOM_ENGINE_CACHE, **kwargs: Any, ) -> torch.fx.GraphModule: """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT @@ -153,11 +153,11 @@ def compile( hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer) timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation lazy_engine_init (bool): Defer setting up engines until the compilation of all engines is complete. Can allow larger models with multiple graph breaks to compile but can lead to oversubscription of GPU memory at runtime. - save_engine_cache (bool): Whether to save the compiled TRT engines to hard disk - load_engine_cache (bool): Whether to load the compiled TRT engines from hard disk + cache_built_engines (bool): Whether to save the compiled TRT engines to storage + reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage engine_cache_dir (str): Directory to store the cached TRT engines engine_cache_size (int): Maximum hard-disk space to use for the engine cache - engine_cache_instance (Optional[BaseEngineCache]): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache + custom_engine_cache (Optional[BaseEngineCache]): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache. If used, engine_cache_dir and engine_cache_size will be ignored. 
**kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT @@ -235,10 +235,9 @@ def compile( gm = post_lowering(gm) logger.debug("Lowered Input graph: " + str(gm.graph)) - if engine_cache_instance is None: - engine_cache_instance = EngineCacheInstanceCreator.get_creator( - engine_cache_size, engine_cache_dir - ).engine_cache_instance + if cache_built_engines or reuse_cached_engines: + if custom_engine_cache is None: + custom_engine_cache = DiskEngineCache(engine_cache_dir, engine_cache_size) compilation_options = { "enabled_precisions": ( @@ -273,11 +272,9 @@ def compile( "hardware_compatible": hardware_compatible, "timing_cache_path": timing_cache_path, "lazy_engine_init": lazy_engine_init, - "save_engine_cache": save_engine_cache, - "load_engine_cache": load_engine_cache, - "engine_cache_dir": engine_cache_dir, - "engine_cache_size": engine_cache_size, - "engine_cache_instance": engine_cache_instance, + "cache_built_engines": cache_built_engines, + "reuse_cached_engines": reuse_cached_engines, + "custom_engine_cache": custom_engine_cache, } settings = CompilationSettings(**compilation_options) @@ -686,21 +683,3 @@ def convert_exported_program_to_serialized_trt_engine( serialized_engine: bytes = interpreter_result.serialized_engine return serialized_engine - - -class EngineCacheInstanceCreator: - engine_cache_creator = None - - def __init__(self, engine_cache_size: int, engine_cache_dir: str) -> None: - self.engine_cache_instance = EngineCache( - engine_cache_size=engine_cache_size, - engine_cache_dir=engine_cache_dir, - ) - - @classmethod - def get_creator( - cls, engine_cache_size: int, engine_cache_dir: str - ) -> EngineCacheInstanceCreator: - if cls.engine_cache_creator is None: - cls.engine_cache_creator = cls(engine_cache_size, engine_cache_dir) - return cls.engine_cache_creator diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py index e90f3f8c2a..83e85cb3c7 100644 --- a/py/torch_tensorrt/dynamo/_defaults.py +++ b/py/torch_tensorrt/dynamo/_defaults.py @@ -4,7 +4,6 @@ import torch from torch_tensorrt._Device import Device from torch_tensorrt._enums import EngineCapability, dtype -from torch_tensorrt.dynamo._engine_caching import EngineCache ENABLED_PRECISIONS = {dtype.f32} DEBUG = False @@ -36,13 +35,11 @@ tempfile.gettempdir(), "torch_tensorrt_engine_cache", "timing_cache.bin" ) LAZY_ENGINE_INIT = False -SAVE_ENGINE_CACHE = True -LOAD_ENGINE_CACHE = True +CACHE_BUILT_ENGINES = True +REUSE_CACHED_ENGINES = True ENGINE_CACHE_DIR = os.path.join(tempfile.gettempdir(), "torch_tensorrt_engine_cache") ENGINE_CACHE_SIZE = 1073741824 -ENGINE_CACHE_INSTANCE = EngineCache( - engine_cache_size=ENGINE_CACHE_SIZE, engine_cache_dir=ENGINE_CACHE_DIR -) +CUSTOM_ENGINE_CACHE = None def default_device() -> Device: diff --git a/py/torch_tensorrt/dynamo/_engine_caching.py b/py/torch_tensorrt/dynamo/_engine_caching.py index f9b6f075eb..01220233ea 100644 --- a/py/torch_tensorrt/dynamo/_engine_caching.py +++ b/py/torch_tensorrt/dynamo/_engine_caching.py @@ -3,7 +3,6 @@ import os import pickle import shutil -import sys from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Tuple, cast @@ -44,78 +43,126 @@ def get_hash(gm: torch.fx.GraphModule) -> str: return hash_val - @abstractmethod - def save( - self, - hash: str, + @staticmethod + def pack( serialized_engine: bytes, input_names: List[str], output_names: List[str], - weight_name_map: Optional[Dict[str, Any]] = None, - ) -> bool: - """Save the 
serialized engine to hard disk + weight_name_map: Optional[Dict[str, Any]], + ) -> bytes: + """Pack serialized engine, input names, output names, and weight map into a single blob Args: - hash (str): hash value of the GraphModule serialized_engine (bytes): serialized TRT engine input_names (List[str]): input names of TRT engine output_names (List[str]): output names of TRT engine weight_name_map (Optional[Dict[str, Any]]): weight name map for refitting Returns: - bool: whether the serialized engine is saved successfully + bytes: packed blob + """ + return pickle.dumps( + { + "serialized_engine": bytes(serialized_engine), + "input_names": input_names, + "output_names": output_names, + "weight_name_map": weight_name_map, + } + ) + + @staticmethod + def unpack( + packed_obj: bytes, + ) -> Tuple[bytes, List[str], List[str], Optional[Dict[str, Any]]]: + """Unpack packed blob into serialized engine, input names, output names, and weight map + + Args: + packed_obj (bytes): packed blob + + Returns: + Tuple[bytes, List[str], List[str], Optional[Dict[str, Any]]]: serialized engine, input names, output names, weight name map + """ + unpacked = pickle.loads(packed_obj) + return ( + unpacked["serialized_engine"], + unpacked["input_names"], + unpacked["output_names"], + unpacked["weight_name_map"], + ) + + @abstractmethod + def save(self, hash: str, blob: bytes, *args: Any, **kwargs: Any) -> None: + """Store blob in cache + + Args: + hash (str): hash value of the GraphModule + blob (bytes): packed blob """ pass @abstractmethod - def load( - self, hash: str - ) -> Tuple[Optional[bytes], List[str], List[str], Optional[Dict[str, Any]]]: - """Load the serialized engine from hard disk + def load(self, hash: str, *args: Any, **kwargs: Any) -> Optional[bytes]: + """Load blob from storage Args: hash (str): hash value of the GraphModule Returns: - Sequence[Optional[bytes], List[str], List[str], Optional[Dict[str, Any]]]: serialized engine, input names, output names, weight name map + Optional[bytes]: blob or None if doesn't hit """ pass -class EngineCache(BaseEngineCache): +class DiskEngineCache(BaseEngineCache): + dir2hash2size_map: Dict[str, Dict[str, int]] = ( + {} + ) # dir2hash2size_map["engine_cache_dir"]["hash"] = size def __init__( self, - engine_cache_size: int, engine_cache_dir: str, + engine_cache_size: int, ) -> None: - self.total_engine_cache_size = engine_cache_size - self.available_engine_cache_size = engine_cache_size + + def get_dir_size(path: str) -> int: + total = 0 + with os.scandir(path) as it: + for entry in it: + if entry.is_file(): + total += entry.stat().st_size + elif entry.is_dir(): + total += get_dir_size(entry.path) + return total + + if not os.path.exists(engine_cache_dir): + os.makedirs(engine_cache_dir, exist_ok=True) self.engine_cache_dir = engine_cache_dir - self.hash2size_map: Dict[str, int] = {} + self.total_engine_cache_size = engine_cache_size + self.available_engine_cache_size = engine_cache_size - get_dir_size( + engine_cache_dir + ) + if engine_cache_dir not in DiskEngineCache.dir2hash2size_map: + DiskEngineCache.dir2hash2size_map[engine_cache_dir] = {} def has_available_cache_size(self, needed_size: int) -> bool: - """Check if the cache has available space for saving the serialized engine + """Check if the cache has available space for saving object Args: - needed_size (int): needed size for erialized TRT engine and/or weight_name_map + needed_size (int): needed size for saving object Returns: - bool: whether the cache has available size for the serialized engine + bool: 
whether the cache has available size for saving object """ return needed_size <= self.available_engine_cache_size - def clear_cache(self, needed_min_size: int) -> bool: + def clear_cache(self, needed_min_size: int) -> None: """Clear the cache to make sure at least `needed_min_size` bytes are available, if possible Args: needed_min_size (int): the minimum needed size - - Returns: - bool: whether the cache is cleared successfully """ - def LRU() -> bool: + def LRU() -> None: """Clear the Least Recently Used engine in the cache""" # Get the list of engine directories engines_hash_values = os.listdir(self.engine_cache_dir) @@ -132,8 +179,10 @@ def LRU() -> bool: # Remove the entire directory shutil.rmtree(engine_path) # Update the available cache size - self.available_engine_cache_size += self.hash2size_map.pop( - engine_hash, 0 + self.available_engine_cache_size += ( + DiskEngineCache.dir2hash2size_map[self.engine_cache_dir].pop( + engine_hash, 0 + ) ) _LOGGER.info( f"Removed the engine cache at {engine_path}, available cache size: {self.available_engine_cache_size} bytes." @@ -142,127 +191,61 @@ def LRU() -> bool: _LOGGER.warning( f"Failed to clear the engine cache at {engine_path}: {e}" ) - return False - return True - if not os.path.exists(self.engine_cache_dir): - return False - - _LOGGER.info( - f"Total cache size: {self.total_engine_cache_size} bytes; available cache size: {self.available_engine_cache_size} bytes. Clearing the cache to make sure at least {needed_min_size} bytes are available." - ) - return LRU() + if needed_min_size > self.total_engine_cache_size: + _LOGGER.warning( + f"The needed minimum size {needed_min_size} is larger than the total cache size {self.total_engine_cache_size}. Nothing will be cleared." + ) + else: + LRU() def save( self, hash: str, - serialized_engine: bytes, - input_names: List[str], - output_names: List[str], - weight_name_map: Optional[Dict[str, Any]] = None, - ) -> bool: - serialized_engine_size = int(serialized_engine.nbytes) - if weight_name_map is not None: - serialized_engine_size += sum( - sys.getsizeof(v) for v in weight_name_map.values() - ) - if serialized_engine_size > self.total_engine_cache_size: + blob: bytes, + ) -> None: + blob_size = len(blob) + if blob_size > self.total_engine_cache_size: _LOGGER.warning( - f"The serialized engine cannot be saved because the size of the engine {serialized_engine_size} is larger than the total cache size {self.total_engine_cache_size}." + f"The serialized engine cannot be saved because the size {blob_size} is larger than the total cache size {self.total_engine_cache_size}." 
) - return False + return - # Check if there is enough available cache size for the serialized engine and/or weight_name_map - if not self.has_available_cache_size(serialized_engine_size): - self.clear_cache(serialized_engine_size) + if not self.has_available_cache_size(blob_size): + self.clear_cache(blob_size) - # Save the serialized engine to the cache directory - if self.has_available_cache_size(serialized_engine_size): - self.hash2size_map[hash] = serialized_engine_size - self.available_engine_cache_size -= serialized_engine_size + if self.has_available_cache_size(blob_size): + DiskEngineCache.dir2hash2size_map[self.engine_cache_dir][hash] = blob_size + self.available_engine_cache_size -= blob_size directory = os.path.join(self.engine_cache_dir, hash) + if not os.path.exists(directory): + os.makedirs(directory, exist_ok=True) - engine_path = os.path.join( - directory, - "engine.trt", - ) - io_names_path = os.path.join( + blob_path = os.path.join( directory, - "io_names.pkl", + "blob.bin", ) try: - os.makedirs(os.path.dirname(engine_path), exist_ok=True) - with open(engine_path, "wb") as f: - f.write(serialized_engine) - os.makedirs(os.path.dirname(io_names_path), exist_ok=True) - with open(io_names_path, "wb") as f: - pickle.dump( - {"input_names": input_names, "output_names": output_names}, f - ) - _LOGGER.info(f"The TRT engine was saved to {engine_path}") + with open(blob_path, "wb") as f: + f.write(blob) + _LOGGER.info(f"The blob was saved to {blob_path}") except Exception as e: - del self.hash2size_map[hash] - self.available_engine_cache_size += serialized_engine_size + del DiskEngineCache.dir2hash2size_map[self.engine_cache_dir][hash] + self.available_engine_cache_size += blob_size shutil.rmtree(directory) - _LOGGER.warning(f"Failed to save the TRT engine to {engine_path}: {e}") - return False - - if weight_name_map is not None: - weight_name_map_path = os.path.join( - directory, - "weight_name_map.pkl", - ) - try: - os.makedirs(os.path.dirname(weight_name_map_path), exist_ok=True) - with open(weight_name_map_path, "wb") as f: - pickle.dump(weight_name_map, f) - _LOGGER.info( - f"The weight_name_map was saved to {weight_name_map_path}" - ) - except Exception as e: - del self.hash2size_map[hash] - self.available_engine_cache_size += serialized_engine_size - shutil.rmtree(directory) - _LOGGER.warning( - f"Failed to save the weight_name_map to {weight_name_map_path}: {e}" - ) - return False - - return True + _LOGGER.warning(f"Failed to save the blob to {blob_path}: {e}") else: _LOGGER.warning( - f"The serialized engine {serialized_engine_size} is still larger than the available cache size {self.available_engine_cache_size}." + f"The size {blob_size} is still larger than the available cache size {self.available_engine_cache_size}." 
) - return False - def load( - self, hash: str - ) -> Tuple[Optional[bytes], List[str], List[str], Optional[Dict[str, Any]]]: + def load(self, hash: str) -> Optional[bytes]: directory = os.path.join(self.engine_cache_dir, hash) if os.path.exists(directory): - # load engine - serialized_engine = None - engine_path = os.path.join(directory, "engine.trt") - if os.path.exists(engine_path): - with open(engine_path, "rb") as f: - serialized_engine = f.read() - - input_names = [] - output_names = [] - io_names_path = os.path.join(directory, "io_names.pkl") - if os.path.exists(io_names_path): - with open(io_names_path, "rb") as f: - io_names = pickle.load(f) - input_names = io_names["input_names"] - output_names = io_names["output_names"] - - # load weight_name_map - weight_name_map = None - weight_name_map_path = os.path.join(directory, "weight_name_map.pkl") - if os.path.exists(weight_name_map_path): - with open(weight_name_map_path, "rb") as f: - weight_name_map = pickle.load(f) - return serialized_engine, input_names, output_names, weight_name_map - else: - return None, [], [], {} + blob_path = os.path.join(directory, "blob.bin") + if os.path.exists(blob_path): + with open(blob_path, "rb") as f: + blob = f.read() + return blob + return None diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py index 90c17d03c3..0327727c9f 100644 --- a/py/torch_tensorrt/dynamo/_settings.py +++ b/py/torch_tensorrt/dynamo/_settings.py @@ -6,6 +6,8 @@ from torch_tensorrt._enums import EngineCapability, dtype from torch_tensorrt.dynamo._defaults import ( ASSUME_DYNAMIC_SHAPE_SUPPORT, + CACHE_BUILT_ENGINES, + CUSTOM_ENGINE_CACHE, DEBUG, DISABLE_TF32, DLA_GLOBAL_DRAM_SIZE, @@ -14,13 +16,9 @@ DRYRUN, ENABLE_EXPERIMENTAL_DECOMPOSITIONS, ENABLED_PRECISIONS, - ENGINE_CACHE_DIR, - ENGINE_CACHE_INSTANCE, - ENGINE_CACHE_SIZE, ENGINE_CAPABILITY, HARDWARE_COMPATIBLE, LAZY_ENGINE_INIT, - LOAD_ENGINE_CACHE, MAKE_REFITABLE, MAX_AUX_STREAMS, MIN_BLOCK_SIZE, @@ -28,7 +26,7 @@ OPTIMIZATION_LEVEL, PASS_THROUGH_BUILD_FAILURES, REQUIRE_FULL_COMPILATION, - SAVE_ENGINE_CACHE, + REUSE_CACHED_ENGINES, SPARSE_WEIGHTS, TIMING_CACHE_PATH, TRUNCATE_DOUBLE, @@ -80,11 +78,9 @@ class CompilationSettings: output to a file if a string path is specified hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer) timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation - save_engine_cache (bool): Whether to save the compiled TRT engines to hard disk - load_engine_cache (bool): Whether to load the compiled TRT engines from hard disk - engine_cache_dir (str): Directory to store the cached TRT engines - engine_cache_size (int): Maximum hard-disk space to use for the engine cache - engine_cache_instance (BaseEngineCache): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache + cache_built_engines (bool): Whether to save the compiled TRT engines to storage + reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage + custom_engine_cache (Optional[BaseEngineCache]): Engine cache instance to use for saving and loading engines. 
Users can provide their own engine cache by inheriting from BaseEngineCache """ enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS) @@ -117,8 +113,6 @@ class CompilationSettings: hardware_compatible: bool = HARDWARE_COMPATIBLE timing_cache_path: str = TIMING_CACHE_PATH lazy_engine_init: bool = LAZY_ENGINE_INIT - save_engine_cache: bool = SAVE_ENGINE_CACHE - load_engine_cache: bool = LOAD_ENGINE_CACHE - engine_cache_dir: str = ENGINE_CACHE_DIR - engine_cache_size: int = ENGINE_CACHE_SIZE - engine_cache_instance: BaseEngineCache = ENGINE_CACHE_INSTANCE + cache_built_engines: bool = CACHE_BUILT_ENGINES + reuse_cached_engines: bool = REUSE_CACHED_ENGINES + custom_engine_cache: Optional[BaseEngineCache] = CUSTOM_ENGINE_CACHE diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index cffcea5b9f..a422f7dde4 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -522,30 +522,35 @@ def run( TRTInterpreterResult """ if ( - self.compilation_settings.save_engine_cache - or self.compilation_settings.load_engine_cache - ): - engine_cache = self.compilation_settings.engine_cache_instance - hash_val = engine_cache.get_hash(self.module) - - if self.compilation_settings.load_engine_cache: - # query the cached TRT engine - serialized_engine, input_names, output_names, weight_name_map = ( - engine_cache.load(hash_val) - ) - if serialized_engine is not None: - self._input_names = input_names - self._output_names = output_names - self.weight_name_map = weight_name_map - _LOGGER.info( - "Hit the cached TRT engine. It is loaded for skipping recompilation." - ) - return TRTInterpreterResult( - serialized_engine, - self._input_names, - self._output_names, - self.weight_name_map, - ) + self.compilation_settings.custom_engine_cache is not None + ): # custom_engine_cache could be None if this function is called from convert_exported_program_to_serialized_trt_engine etc. + if ( + self.compilation_settings.cache_built_engines + or self.compilation_settings.reuse_cached_engines + ): + engine_cache = self.compilation_settings.custom_engine_cache + hash_val = engine_cache.get_hash(self.module) + + if self.compilation_settings.reuse_cached_engines: + # query the cached TRT engine + blob = engine_cache.load(hash_val) + if blob is not None: # hit the cache + serialized_engine, input_names, output_names, weight_name_map = ( + engine_cache.unpack(blob) + ) + self._input_names = input_names + self._output_names = output_names + self.weight_name_map = weight_name_map + _LOGGER.info( + "Hit the cached TRT engine. It is loaded and skip recompilation." + ) + # TODO: refit the engine here or outside (within convert_module)? 
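For reference, the blob that load() hands back in this hit path is just the dict that pack() pickled. A minimal round-trip sketch, assuming pack and unpack remain static helpers on BaseEngineCache (the engine bytes below are a placeholder, not a real TensorRT engine):

    # Minimal pack/unpack round-trip; the engine bytes are a stand-in for a real
    # serialized TensorRT engine.
    from torch_tensorrt.dynamo._engine_caching import BaseEngineCache

    blob = BaseEngineCache.pack(
        serialized_engine=b"placeholder-engine-bytes",
        input_names=["x"],
        output_names=["output0"],
        weight_name_map=None,
    )
    engine_bytes, input_names, output_names, weight_name_map = BaseEngineCache.unpack(blob)
    assert engine_bytes == b"placeholder-engine-bytes"
    assert input_names == ["x"] and output_names == ["output0"] and weight_name_map is None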
+ return TRTInterpreterResult( + serialized_engine, + self._input_names, + self._output_names, + self.weight_name_map, + ) self._construct_trt_network_def() @@ -576,14 +581,17 @@ def run( self._save_timing_cache( builder_config, self.compilation_settings.timing_cache_path ) - if self.compilation_settings.save_engine_cache: - engine_cache.save( - hash_val, + if ( + self.compilation_settings.custom_engine_cache is not None + and self.compilation_settings.cache_built_engines + ): + blob = engine_cache.pack( serialized_engine, self._input_names, self._output_names, self.weight_name_map, ) + engine_cache.save(hash_val, blob) with io.BytesIO() as engine_bytes: engine_bytes.write(serialized_engine) diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index dfd22e7f9f..3945e976d6 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -495,6 +495,22 @@ def parse_dynamo_kwargs(kwargs: Any) -> CompilationSettings: ) settings.require_full_compilation = False + # If cache_built_engines and reuse_cached_engines are True but custom_engine_cache is not provided, + # then create a default disk engine cache + if kwargs.get("cache_built_engines") or kwargs.get("reuse_cached_engines"): + if settings.custom_engine_cache is None: + from torch_tensorrt.dynamo._engine_caching import DiskEngineCache + + engine_cache_dir = kwargs.get( + "engine_cache_dir", _defaults.ENGINE_CACHE_DIR + ) + engine_cache_size = kwargs.get( + "engine_cache_size", _defaults.ENGINE_CACHE_SIZE + ) + settings.custom_engine_cache = DiskEngineCache( + engine_cache_dir, engine_cache_size + ) + logger.info("Compilation Settings: %s\n", settings) return settings From 64885deb25651b420fc6171252e87e921a118218 Mon Sep 17 00:00:00 2001 From: Evan Li Date: Wed, 21 Aug 2024 18:01:22 -0700 Subject: [PATCH 07/14] small fixes --- py/torch_tensorrt/dynamo/_engine_caching.py | 4 ++-- py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_engine_caching.py b/py/torch_tensorrt/dynamo/_engine_caching.py index 01220233ea..ee5a6ec854 100644 --- a/py/torch_tensorrt/dynamo/_engine_caching.py +++ b/py/torch_tensorrt/dynamo/_engine_caching.py @@ -8,7 +8,7 @@ import torch from torch._inductor.codecache import FxGraphCachePickler -from torch.fx.experimental.proxy_tensor import maybe_disable_fake_tensor_mode +from torch.fx.experimental.proxy_tensor import unset_fake_temporarily _LOGGER: logging.Logger = logging.getLogger(__name__) @@ -34,7 +34,7 @@ def get_hash(gm: torch.fx.GraphModule) -> str: str: hash value of the GraphModule """ # parameters are set to 0 - with maybe_disable_fake_tensor_mode(): + with unset_fake_temporarily(): new_gm = copy.deepcopy(gm) for name, param in new_gm.named_parameters(): param.data.zero_() diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index a422f7dde4..976c943e0d 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -18,6 +18,7 @@ ) import numpy as np +import tensorrt as trt import torch import torch.fx from torch.fx.node import _get_qualified_name @@ -42,7 +43,6 @@ from torch_tensorrt.fx.observer import Observer from torch_tensorrt.logging import TRT_LOGGER -import tensorrt as trt from packaging import version _LOGGER: logging.Logger = logging.getLogger(__name__) @@ -542,7 +542,7 @@ def run( self._output_names = 
output_names self.weight_name_map = weight_name_map _LOGGER.info( - "Hit the cached TRT engine. It is loaded and skip recompilation." + "Found the cached engine that corresponds to this graph. It is directly loaded." ) # TODO: refit the engine here or outside (within convert_module)? return TRTInterpreterResult( From 88d1a2507dad0e0a1e417f7ba418c5eebad834ef Mon Sep 17 00:00:00 2001 From: Evan Li Date: Fri, 23 Aug 2024 17:52:39 -0700 Subject: [PATCH 08/14] move refit into interpret_module_to_result --- py/torch_tensorrt/dynamo/_engine_caching.py | 6 +- .../dynamo/conversion/_TRTInterpreter.py | 1 - .../dynamo/conversion/_conversion.py | 57 ++++++++++--------- 3 files changed, 34 insertions(+), 30 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_engine_caching.py b/py/torch_tensorrt/dynamo/_engine_caching.py index ee5a6ec854..c8ff7aba50 100644 --- a/py/torch_tensorrt/dynamo/_engine_caching.py +++ b/py/torch_tensorrt/dynamo/_engine_caching.py @@ -48,7 +48,7 @@ def pack( serialized_engine: bytes, input_names: List[str], output_names: List[str], - weight_name_map: Optional[Dict[str, Any]], + weight_name_map: Optional[Dict[Any, Any]], ) -> bytes: """Pack serialized engine, input names, output names, and weight map into a single blob @@ -56,7 +56,7 @@ def pack( serialized_engine (bytes): serialized TRT engine input_names (List[str]): input names of TRT engine output_names (List[str]): output names of TRT engine - weight_name_map (Optional[Dict[str, Any]]): weight name map for refitting + weight_name_map (Optional[Dict[Any, Any]]): weight name map for refitting Returns: bytes: packed blob @@ -73,7 +73,7 @@ def pack( @staticmethod def unpack( packed_obj: bytes, - ) -> Tuple[bytes, List[str], List[str], Optional[Dict[str, Any]]]: + ) -> Tuple[bytes, List[str], List[str], Optional[Dict[Any, Any]]]: """Unpack packed blob into serialized engine, input names, output names, and weight map Args: diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 976c943e0d..d10a5bca38 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -544,7 +544,6 @@ def run( _LOGGER.info( "Found the cached engine that corresponds to this graph. It is directly loaded." ) - # TODO: refit the engine here or outside (within convert_module)? return TRTInterpreterResult( serialized_engine, self._input_names, diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py index e0643cf996..03bf14dfc6 100644 --- a/py/torch_tensorrt/dynamo/conversion/_conversion.py +++ b/py/torch_tensorrt/dynamo/conversion/_conversion.py @@ -114,6 +114,36 @@ def interpret_module_to_result( ) interpreter_result = interpreter.run() + + if settings.make_refitable: + # Run fast refit even if it's the first compilation. + # This is to ensure that the weight name map is correct for future refits. + # If the fast refit fails, remove the weight name map. 
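The refit logic introduced above is needed because the cache key is weight-agnostic: get_hash() deep-copies the graph and zeroes every parameter before hashing, so modules that differ only in their weights share one cache entry and must be refitted with the caller's current weights. A small sketch of that property, assuming get_hash stays a static, weight-agnostic helper:

    # Sketch: two separately initialized copies of the same architecture are
    # expected to produce the same engine-cache hash, since parameters are
    # zeroed before hashing.
    import torch
    import torchvision.models as models
    from torch_tensorrt.dynamo._engine_caching import BaseEngineCache

    gm_a = torch.fx.symbolic_trace(models.resnet18().eval())
    gm_b = torch.fx.symbolic_trace(models.resnet18().eval())  # different random weights
    print(BaseEngineCache.get_hash(gm_a) == BaseEngineCache.get_hash(gm_b))  # expected: True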
+ from torch_tensorrt.dynamo._refit import _refit_single_trt_engine_with_gm + from torch_tensorrt.logging import TRT_LOGGER + + runtime = trt.Runtime(TRT_LOGGER) + refit_test_engine = runtime.deserialize_cuda_engine( + interpreter_result.serialized_engine + ) + try: + _refit_single_trt_engine_with_gm( + new_gm=module, + old_engine=refit_test_engine, + input_list=inputs, + settings=settings, + weight_name_map=interpreter_result.weight_name_map, + ) + except AssertionError: + # TRTInterpreterResult is a tuple, so we need to create a new one + interpreter_result = TRTInterpreterResult( + interpreter_result.serialized_engine, + interpreter_result.input_names, + interpreter_result.output_names, + None, + ) + logger.warning("Fast refit test failed. Removing the weight map caching.") + return interpreter_result @@ -133,31 +163,6 @@ def convert_module( PythonTorchTensorRTModule or TorchTensorRTModule """ interpreter_result = interpret_module_to_result(module, inputs, settings) - # Test fast refit: - from torch_tensorrt.dynamo._refit import _refit_single_trt_engine_with_gm - from torch_tensorrt.logging import TRT_LOGGER - - weight_name_map: Any = None - # Do the test refit with cached map if make_refitable is enabled - if settings.make_refitable: - runtime = trt.Runtime(TRT_LOGGER) - refit_test_engine = runtime.deserialize_cuda_engine( - interpreter_result.serialized_engine - ) - try: - _refit_single_trt_engine_with_gm( - new_gm=module, - old_engine=refit_test_engine, - input_list=inputs, - settings=settings, - weight_name_map=interpreter_result.weight_name_map, - ) - weight_name_map = interpreter_result.weight_name_map - except AssertionError: - logger.warning("Fast refit test failed. Removing the weight map caching.") - - del refit_test_engine - torch.cuda.empty_cache() rt_cls = PythonTorchTensorRTModule @@ -181,5 +186,5 @@ def convert_module( output_binding_names=list(interpreter_result.output_names), name=name, settings=settings, - weight_name_map=weight_name_map, + weight_name_map=interpreter_result.weight_name_map, ) From 81eb7c53eeeb38f4833755615729df6baab3262c Mon Sep 17 00:00:00 2001 From: Evan Li Date: Tue, 27 Aug 2024 12:35:59 -0700 Subject: [PATCH 09/14] update refit usage --- py/torch_tensorrt/dynamo/_compiler.py | 3 ++ .../dynamo/conversion/_TRTInterpreter.py | 18 +++++++++++ .../dynamo/conversion/_conversion.py | 30 ------------------- py/torch_tensorrt/dynamo/utils.py | 3 ++ 4 files changed, 24 insertions(+), 30 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index bc31592c06..349243769c 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -236,6 +236,9 @@ def compile( logger.debug("Lowered Input graph: " + str(gm.graph)) if cache_built_engines or reuse_cached_engines: + assert ( + make_refitable + ), "Engine caching requires make_refitable to be set to True" if custom_engine_cache is None: custom_engine_cache = DiskEngineCache(engine_cache_dir, engine_cache_size) diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index d10a5bca38..60c5012f77 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -544,6 +544,24 @@ def run( _LOGGER.info( "Found the cached engine that corresponds to this graph. It is directly loaded." 
) + + from torch_tensorrt.dynamo._refit import ( + _refit_single_trt_engine_with_gm, + ) + + runtime = trt.Runtime(TRT_LOGGER) + engine = runtime.deserialize_cuda_engine(serialized_engine) + + _refit_single_trt_engine_with_gm( + new_gm=self.module, + old_engine=engine, + input_list=self.input_specs, + settings=self.compilation_settings, + weight_name_map=weight_name_map, + ) + + serialized_engine = bytes(engine.serialize()) + return TRTInterpreterResult( serialized_engine, self._input_names, diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py index 03bf14dfc6..36d7bd71e9 100644 --- a/py/torch_tensorrt/dynamo/conversion/_conversion.py +++ b/py/torch_tensorrt/dynamo/conversion/_conversion.py @@ -114,36 +114,6 @@ def interpret_module_to_result( ) interpreter_result = interpreter.run() - - if settings.make_refitable: - # Run fast refit even if it's the first compilation. - # This is to ensure that the weight name map is correct for future refits. - # If the fast refit fails, remove the weight name map. - from torch_tensorrt.dynamo._refit import _refit_single_trt_engine_with_gm - from torch_tensorrt.logging import TRT_LOGGER - - runtime = trt.Runtime(TRT_LOGGER) - refit_test_engine = runtime.deserialize_cuda_engine( - interpreter_result.serialized_engine - ) - try: - _refit_single_trt_engine_with_gm( - new_gm=module, - old_engine=refit_test_engine, - input_list=inputs, - settings=settings, - weight_name_map=interpreter_result.weight_name_map, - ) - except AssertionError: - # TRTInterpreterResult is a tuple, so we need to create a new one - interpreter_result = TRTInterpreterResult( - interpreter_result.serialized_engine, - interpreter_result.input_names, - interpreter_result.output_names, - None, - ) - logger.warning("Fast refit test failed. 
Removing the weight map caching.") - return interpreter_result diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 3945e976d6..460fd7a9f8 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -498,6 +498,9 @@ def parse_dynamo_kwargs(kwargs: Any) -> CompilationSettings: # If cache_built_engines and reuse_cached_engines are True but custom_engine_cache is not provided, # then create a default disk engine cache if kwargs.get("cache_built_engines") or kwargs.get("reuse_cached_engines"): + assert kwargs.get( + "make_refitable" + ), "Engine caching requires make_refitable to be set to True" if settings.custom_engine_cache is None: from torch_tensorrt.dynamo._engine_caching import DiskEngineCache From 04bb63a0475abfada83c1444dec65876433b5a6d Mon Sep 17 00:00:00 2001 From: Evan Li Date: Tue, 27 Aug 2024 19:52:12 -0700 Subject: [PATCH 10/14] force using slow refit, add unit tests --- examples/dynamo/engine_caching_example.py | 32 ++-- .../dynamo/conversion/_TRTInterpreter.py | 18 ++- tests/py/dynamo/models/test_engine_cache.py | 153 ++++++++++++++++++ 3 files changed, 185 insertions(+), 18 deletions(-) create mode 100644 tests/py/dynamo/models/test_engine_cache.py diff --git a/examples/dynamo/engine_caching_example.py b/examples/dynamo/engine_caching_example.py index 89912e74b0..2d1018bb6e 100644 --- a/examples/dynamo/engine_caching_example.py +++ b/examples/dynamo/engine_caching_example.py @@ -10,7 +10,6 @@ np.random.seed(0) torch.manual_seed(0) -size = (100, 3, 224, 224) model = models.resnet18(pretrained=True).eval().to("cuda") enabled_precisions = {torch.float} @@ -24,7 +23,7 @@ def remove_timing_cache(path=TIMING_CACHE_PATH): os.remove(path) -def dynamo_path(iterations=3): +def dynamo_compile(iterations=3): times = [] start = torch.cuda.Event(enable_timing=True) end = torch.cuda.Event(enable_timing=True) @@ -42,7 +41,7 @@ def dynamo_path(iterations=3): # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. 
for i in range(iterations): inputs = [torch.rand((100 + i, 3, 224, 224)).to("cuda")] - remove_timing_cache() # remove timing cache for engine caching messurement + remove_timing_cache() # remove timing cache just for engine caching messurement if i == 0: cache_built_engines = False reuse_cached_engines = False @@ -63,11 +62,15 @@ def dynamo_path(iterations=3): reuse_cached_engines=reuse_cached_engines, engine_cache_size=1 << 30, # 1GB ) + # output = trt_gm(*inputs) end.record() torch.cuda.synchronize() times.append(start.elapsed_time(end)) - print("-----dynamo_path-----> compilation time:\n", times, "milliseconds") + print("----------------dynamo_compile----------------") + print("disable engine caching, used:", times[0], "ms") + print("enable engine caching to cache engines, used:", times[1], "ms") + print("enable engine caching to reuse engines, used:", times[2], "ms") # Custom Engine Cache @@ -84,11 +87,13 @@ def save( blob: bytes, prefix: str = "blob", ): + if not os.path.exists(self.engine_cache_dir): + os.makedirs(self.engine_cache_dir, exist_ok=True) + path = os.path.join( self.engine_cache_dir, f"{prefix}_{hash}.bin", ) - os.makedirs(path, exist_ok=True) with open(path, "wb") as f: f.write(blob) @@ -101,7 +106,7 @@ def load(self, hash: str, prefix: str = "blob") -> Optional[bytes]: return None -def compile_path(iterations=3): +def torch_compile(iterations=3): times = [] engine_cache = MyEngineCache("/tmp/your_dir") start = torch.cuda.Event(enable_timing=True) @@ -112,8 +117,8 @@ def compile_path(iterations=3): # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. for i in range(iterations): - inputs = [torch.rand(size).to("cuda")] - # remove timing cache and reset dynamo for engine caching messurement + inputs = [torch.rand((100, 3, 224, 224)).to("cuda")] + # remove timing cache and reset dynamo just for engine caching messurement remove_timing_cache() torch._dynamo.reset() @@ -129,7 +134,7 @@ def compile_path(iterations=3): model, backend="tensorrt", options={ - "use_python_runtime": use_python_runtime, + "use_python_runtime": True, "enabled_precisions": enabled_precisions, "debug": debug, "min_block_size": min_block_size, @@ -144,9 +149,12 @@ def compile_path(iterations=3): torch.cuda.synchronize() times.append(start.elapsed_time(end)) - print("-----compile_path-----> compilation time:\n", times, "milliseconds") + print("----------------torch_compile----------------") + print("disable engine caching, used:", times[0], "ms") + print("enable engine caching to cache engines, used:", times[1], "ms") + print("enable engine caching to reuse engines, used:", times[2], "ms") if __name__ == "__main__": - dynamo_path() - # compile_path() + dynamo_compile() + torch_compile() diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 60c5012f77..16a1e0c75b 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -545,25 +545,31 @@ def run( "Found the cached engine that corresponds to this graph. It is directly loaded." 
) + runtime = trt.Runtime(TRT_LOGGER) + engine = runtime.deserialize_cuda_engine(serialized_engine) + from torch_tensorrt.dynamo._refit import ( _refit_single_trt_engine_with_gm, ) - runtime = trt.Runtime(TRT_LOGGER) - engine = runtime.deserialize_cuda_engine(serialized_engine) - + # TODO: Fast refit is problematic for now. It will fail if the engine has batch_norm layers. + # We set weight_name_map=None to use slow refit anyway for now. Will fix it in the future. _refit_single_trt_engine_with_gm( new_gm=self.module, old_engine=engine, input_list=self.input_specs, settings=self.compilation_settings, - weight_name_map=weight_name_map, + weight_name_map=None, ) - serialized_engine = bytes(engine.serialize()) + serialized_engine = engine.serialize() + + with io.BytesIO() as engine_bytes: + engine_bytes.write(serialized_engine) + engine_str = engine_bytes.getvalue() return TRTInterpreterResult( - serialized_engine, + engine_str, self._input_names, self._output_names, self.weight_name_map, diff --git a/tests/py/dynamo/models/test_engine_cache.py b/tests/py/dynamo/models/test_engine_cache.py new file mode 100644 index 0000000000..7b6247ced9 --- /dev/null +++ b/tests/py/dynamo/models/test_engine_cache.py @@ -0,0 +1,153 @@ +# type: ignore +import os +import shutil +import unittest +from typing import Optional + +import torch +import torch_tensorrt as torch_trt +import torchvision.models as models +from torch.testing._internal.common_utils import TestCase +from torch_tensorrt.dynamo._defaults import ENGINE_CACHE_DIR +from torch_tensorrt.dynamo._engine_caching import BaseEngineCache +from torch_tensorrt.dynamo.utils import COSINE_THRESHOLD, cosine_similarity + +assertions = unittest.TestCase() + + +class MyEngineCache(BaseEngineCache): + def __init__( + self, + engine_cache_dir: str, + ) -> None: + self.engine_cache_dir = engine_cache_dir + + def save( + self, + hash: str, + blob: bytes, + prefix: str = "blob", + ): + if not os.path.exists(self.engine_cache_dir): + os.makedirs(self.engine_cache_dir, exist_ok=True) + + path = os.path.join( + self.engine_cache_dir, + f"{prefix}_{hash}.bin", + ) + with open(path, "wb") as f: + f.write(blob) + + def load(self, hash: str, prefix: str = "blob") -> Optional[bytes]: + path = os.path.join(self.engine_cache_dir, f"{prefix}_{hash}.bin") + if os.path.exists(path): + with open(path, "rb") as f: + blob = f.read() + return blob + return None + + +class TestEngineCache(TestCase): + + def test_dynamo_compile(self): + model = models.resnet18(pretrained=True).eval().to("cuda") + example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),) + # Mark the dim0 of inputs as dynamic + batch = torch.export.Dim("batch", min=1, max=200) + exp_program = torch.export.export( + model, args=example_inputs, dynamic_shapes={"x": {0: batch}} + ) + engine_cache_dir = ENGINE_CACHE_DIR + if os.path.exists(engine_cache_dir): + shutil.rmtree(engine_cache_dir) + # The 1st iteration is to measure the compilation time without engine caching + # The 2nd and 3rd iterations are to measure the compilation time with engine caching. + # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. + # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. 
+ inputs = [torch.rand((128, 3, 224, 224)).to("cuda")] + results = [] + for i in range(3): + if i == 0: + cache_built_engines = False + reuse_cached_engines = False + else: + cache_built_engines = True + reuse_cached_engines = True + + trt_gm = torch_trt.dynamo.compile( + exp_program, + tuple(inputs), + use_python_runtime=False, + enabled_precisions={torch.float}, + debug=False, + min_block_size=1, + make_refitable=True, + cache_built_engines=cache_built_engines, + reuse_cached_engines=reuse_cached_engines, + engine_cache_size=1 << 30, # 1GB + ) + results.append(trt_gm(*inputs)) + + cos_sim = cosine_similarity(results[0], results[1]) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"test_dynamo_compile TRT without engine caching doesn't match with that with engine caching. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + cos_sim = cosine_similarity(results[1], results[2]) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"test_dynamo_compile TRT with engine caching doesn't match with that cached engine. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + def test_torch_compile(self): + # Custom Engine Cache + model = models.resnet18(pretrained=True).eval().to("cuda") + + engine_cache_dir = "/tmp/your_dir" + if os.path.exists(engine_cache_dir): + shutil.rmtree(engine_cache_dir) + + engine_cache = MyEngineCache(engine_cache_dir) + # The 1st iteration is to measure the compilation time without engine caching + # The 2nd and 3rd iterations are to measure the compilation time with engine caching. + # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. + # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. + inputs = [torch.rand((100, 3, 224, 224)).to("cuda")] + results = [] + for i in range(3): + # remove timing cache and reset dynamo for engine caching messurement + if i == 0: + cache_built_engines = False + reuse_cached_engines = False + else: + cache_built_engines = True + reuse_cached_engines = True + + compiled_model = torch.compile( + model, + backend="tensorrt", + options={ + "use_python_runtime": True, + "enabled_precisions": {torch.float}, + "debug": False, + "min_block_size": 1, + "make_refitable": True, + "cache_built_engines": cache_built_engines, + "reuse_cached_engines": reuse_cached_engines, + "custom_engine_cache": engine_cache, # use custom engine cache + }, + ) + results.append(compiled_model(*inputs)) # trigger the compilation + + cos_sim = cosine_similarity(results[0], results[1]) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"test_torch_compile TRT without engine caching doesn't match with that with engine caching. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + cos_sim = cosine_similarity(results[1], results[2]) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"test_torch_compile TRT with engine caching doesn't match with that cached engine. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) From 16315dd5bc2f460748f59c35cca0e6a31f2f1177 Mon Sep 17 00:00:00 2001 From: Evan Li Date: Wed, 28 Aug 2024 14:41:48 -0700 Subject: [PATCH 11/14] fix issues from comments, add more unit tests --- py/torch_tensorrt/dynamo/_compiler.py | 24 ++- py/torch_tensorrt/dynamo/_settings.py | 4 - py/torch_tensorrt/dynamo/backend/backends.py | 7 +- .../dynamo/conversion/_TRTInterpreter.py | 25 ++- .../dynamo/conversion/_conversion.py | 10 +- py/torch_tensorrt/dynamo/utils.py | 19 +- tests/py/dynamo/models/test_engine_cache.py | 175 +++++++++++++++++- 7 files changed, 223 insertions(+), 41 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 349243769c..c28702f451 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -85,8 +85,8 @@ def compile( lazy_engine_init: bool = _defaults.LAZY_ENGINE_INIT, cache_built_engines: bool = _defaults.CACHE_BUILT_ENGINES, reuse_cached_engines: bool = _defaults.REUSE_CACHED_ENGINES, - engine_cache_dir: str = _defaults.ENGINE_CACHE_DIR, - engine_cache_size: int = _defaults.ENGINE_CACHE_SIZE, + engine_cache_dir: Optional[str] = _defaults.ENGINE_CACHE_DIR, + engine_cache_size: Optional[int] = _defaults.ENGINE_CACHE_SIZE, custom_engine_cache: Optional[BaseEngineCache] = _defaults.CUSTOM_ENGINE_CACHE, **kwargs: Any, ) -> torch.fx.GraphModule: @@ -155,8 +155,8 @@ def compile( lazy_engine_init (bool): Defer setting up engines until the compilation of all engines is complete. Can allow larger models with multiple graph breaks to compile but can lead to oversubscription of GPU memory at runtime. cache_built_engines (bool): Whether to save the compiled TRT engines to storage reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage - engine_cache_dir (str): Directory to store the cached TRT engines - engine_cache_size (int): Maximum hard-disk space to use for the engine cache + engine_cache_dir (Optional[str]): Directory to store the cached TRT engines + engine_cache_size (Optional[int]): Maximum hard-disk space (bytes) to use for the engine cache, default is 1GB. If the cache exceeds this size, the oldest engines will be removed by default custom_engine_cache (Optional[BaseEngineCache]): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache. If used, engine_cache_dir and engine_cache_size will be ignored. 
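Engine caching is only permitted for refittable engines; the guard added later in this hunk asserts make_refitable. A short sketch of the expected failure when the flag is left off, reusing exp_program and inputs as defined in the example script and tests above:

    # Sketch: requesting engine caching without make_refitable should fail fast.
    # exp_program and inputs are assumed to be defined as in the examples/tests above.
    import torch_tensorrt as torch_trt

    try:
        torch_trt.dynamo.compile(
            exp_program,
            tuple(inputs),
            make_refitable=False,  # caching requires refittable engines
            cache_built_engines=True,
            reuse_cached_engines=True,
        )
    except AssertionError as err:
        print(err)  # Engine caching requires make_refitable to be set to True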
**kwargs: Any, Returns: @@ -235,12 +235,16 @@ def compile( gm = post_lowering(gm) logger.debug("Lowered Input graph: " + str(gm.graph)) + engine_cache = None if cache_built_engines or reuse_cached_engines: assert ( make_refitable ), "Engine caching requires make_refitable to be set to True" - if custom_engine_cache is None: - custom_engine_cache = DiskEngineCache(engine_cache_dir, engine_cache_size) + engine_cache = ( + custom_engine_cache + if custom_engine_cache is not None + else DiskEngineCache(engine_cache_dir, engine_cache_size) + ) compilation_options = { "enabled_precisions": ( @@ -277,12 +281,13 @@ def compile( "lazy_engine_init": lazy_engine_init, "cache_built_engines": cache_built_engines, "reuse_cached_engines": reuse_cached_engines, - "custom_engine_cache": custom_engine_cache, } settings = CompilationSettings(**compilation_options) logger.info("Compilation Settings: %s\n", settings) - trt_gm = compile_module(gm, trt_arg_inputs, trt_kwarg_inputs, settings) + trt_gm = compile_module( + gm, trt_arg_inputs, trt_kwarg_inputs, settings, engine_cache + ) return trt_gm @@ -291,6 +296,7 @@ def compile_module( sample_arg_inputs: Sequence[Input], sample_kwarg_inputs: Optional[dict[Any, Any]] = None, settings: CompilationSettings = CompilationSettings(), + engine_cache: Optional[BaseEngineCache] = None, ) -> torch.fx.GraphModule: """Compile a traced FX module @@ -301,6 +307,7 @@ def compile_module( arg_inputs: Inputs to the module kwarg_inputs: kwargs to the module settings: Compilation settings + engine_cache: Engine cache instance to store/load compiled engines Returns: Compiled FX GraphModule """ @@ -457,6 +464,7 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool: submodule_inputs, settings=settings, name=name, + engine_cache=engine_cache, ) trt_modules[name] = trt_module diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py index 0327727c9f..063f6f3718 100644 --- a/py/torch_tensorrt/dynamo/_settings.py +++ b/py/torch_tensorrt/dynamo/_settings.py @@ -7,7 +7,6 @@ from torch_tensorrt.dynamo._defaults import ( ASSUME_DYNAMIC_SHAPE_SUPPORT, CACHE_BUILT_ENGINES, - CUSTOM_ENGINE_CACHE, DEBUG, DISABLE_TF32, DLA_GLOBAL_DRAM_SIZE, @@ -36,7 +35,6 @@ WORKSPACE_SIZE, default_device, ) -from torch_tensorrt.dynamo._engine_caching import BaseEngineCache @dataclass @@ -80,7 +78,6 @@ class CompilationSettings: timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation cache_built_engines (bool): Whether to save the compiled TRT engines to storage reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage - custom_engine_cache (Optional[BaseEngineCache]): Engine cache instance to use for saving and loading engines. 
Users can provide their own engine cache by inheriting from BaseEngineCache """ enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS) @@ -115,4 +112,3 @@ class CompilationSettings: lazy_engine_init: bool = LAZY_ENGINE_INIT cache_built_engines: bool = CACHE_BUILT_ENGINES reuse_cached_engines: bool = REUSE_CACHED_ENGINES - custom_engine_cache: Optional[BaseEngineCache] = CUSTOM_ENGINE_CACHE diff --git a/py/torch_tensorrt/dynamo/backend/backends.py b/py/torch_tensorrt/dynamo/backend/backends.py index ae3cb38f2d..605d963a50 100644 --- a/py/torch_tensorrt/dynamo/backend/backends.py +++ b/py/torch_tensorrt/dynamo/backend/backends.py @@ -48,14 +48,15 @@ def torch_tensorrt_backend( def aot_torch_tensorrt_aten_backend( gm: torch.fx.GraphModule, sample_inputs: Sequence[Any], **kwargs: Any ) -> torch.nn.Module: - settings = parse_dynamo_kwargs(kwargs) - return _pretraced_backend(gm, sample_inputs, settings) + settings, engine_cache = parse_dynamo_kwargs(kwargs) + return _pretraced_backend(gm, sample_inputs, settings, engine_cache) def _pretraced_backend( gm: torch.fx.GraphModule, sample_inputs: Sequence[Any], settings: CompilationSettings = CompilationSettings(), + engine_cache: Any = None, ) -> torch.fx.GraphModule | Callable[..., Any]: """Helper function to manage translation of traced FX module to TRT engines @@ -63,6 +64,7 @@ def _pretraced_backend( module: FX GraphModule to convert inputs: Inputs to the module settings: Compilation settings + engine_cache: Engine cache instance Returns: Compiled FX GraphModule """ @@ -109,6 +111,7 @@ def _pretraced_backend( gm, torchtrt_inputs, settings=settings, + engine_cache=engine_cache, ) return trt_compiled except (AssertionError, RuntimeError): diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 16a1e0c75b..22743af0aa 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -27,6 +27,7 @@ from torch_tensorrt._enums import dtype from torch_tensorrt._Input import Input from torch_tensorrt.dynamo import _defaults +from torch_tensorrt.dynamo._engine_caching import BaseEngineCache from torch_tensorrt.dynamo._settings import CompilationSettings from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext from torch_tensorrt.dynamo.conversion._ConverterRegistry import ( @@ -71,6 +72,7 @@ def __init__( logger_level: trt.ILogger.Severity = trt.ILogger.Severity.WARNING, output_dtypes: Optional[Sequence[dtype]] = None, compilation_settings: CompilationSettings = CompilationSettings(), + engine_cache: Optional[BaseEngineCache] = None, ): super().__init__(module) @@ -126,6 +128,9 @@ def __init__( self.const_mapping: Dict[str, Tuple[Sequence[int], str]] = {} self.weight_name_map: Optional[dict[str, Any]] = None + # Engine cache for storing and reusing TRT engines + self.engine_cache = engine_cache + def validate_conversion(self) -> Set[str]: missing_converters: Set[str] = set() @@ -521,22 +526,22 @@ def run( Return: TRTInterpreterResult """ - if ( - self.compilation_settings.custom_engine_cache is not None - ): # custom_engine_cache could be None if this function is called from convert_exported_program_to_serialized_trt_engine etc. 
+ # self.engine_cache could be None if: + # 1) engine_cache is not passed in when calling this function like convert_exported_program_to_serialized_trt_engine etc., or + # 2) both cache_built_engines and reuse_cached_engines are False + if self.engine_cache is not None: if ( self.compilation_settings.cache_built_engines or self.compilation_settings.reuse_cached_engines ): - engine_cache = self.compilation_settings.custom_engine_cache - hash_val = engine_cache.get_hash(self.module) + hash_val = self.engine_cache.get_hash(self.module) if self.compilation_settings.reuse_cached_engines: # query the cached TRT engine - blob = engine_cache.load(hash_val) + blob = self.engine_cache.load(hash_val) if blob is not None: # hit the cache serialized_engine, input_names, output_names, weight_name_map = ( - engine_cache.unpack(blob) + self.engine_cache.unpack(blob) ) self._input_names = input_names self._output_names = output_names @@ -605,16 +610,16 @@ def run( builder_config, self.compilation_settings.timing_cache_path ) if ( - self.compilation_settings.custom_engine_cache is not None + self.engine_cache is not None and self.compilation_settings.cache_built_engines ): - blob = engine_cache.pack( + blob = self.engine_cache.pack( serialized_engine, self._input_names, self._output_names, self.weight_name_map, ) - engine_cache.save(hash_val, blob) + self.engine_cache.save(hash_val, blob) with io.BytesIO() as engine_bytes: engine_bytes.write(serialized_engine) diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py index 36d7bd71e9..cd38ce56e6 100644 --- a/py/torch_tensorrt/dynamo/conversion/_conversion.py +++ b/py/torch_tensorrt/dynamo/conversion/_conversion.py @@ -10,6 +10,7 @@ from torch_tensorrt._enums import dtype from torch_tensorrt._features import ENABLED_FEATURES from torch_tensorrt._Input import Input +from torch_tensorrt.dynamo._engine_caching import BaseEngineCache from torch_tensorrt.dynamo._settings import CompilationSettings from torch_tensorrt.dynamo.conversion._TRTInterpreter import ( TRTInterpreter, @@ -76,6 +77,7 @@ def interpret_module_to_result( settings: CompilationSettings = CompilationSettings(), arg_inputs: Optional[Sequence[Input]] = None, kwarg_inputs: Optional[dict[str, Any]] = None, + engine_cache: Optional[BaseEngineCache] = None, ) -> TRTInterpreterResult: """Interpret an FX module to a TRTInterpreterResult Args: @@ -85,6 +87,7 @@ def interpret_module_to_result( arg_inputs: Sequence of Tensors representing inputs to the module. kwarg_inputs: A dictionary of Tensors representing inputs to the module. 
settings: Compilation settings + engine_cache: Engine cache instance Returns: TRTInterpreterResult """ @@ -111,6 +114,7 @@ def interpret_module_to_result( logger_level=(trt.Logger.VERBOSE if settings.debug else trt.Logger.WARNING), output_dtypes=output_dtypes, compilation_settings=settings, + engine_cache=engine_cache, ) interpreter_result = interpreter.run() @@ -122,6 +126,7 @@ def convert_module( inputs: Sequence[Input], settings: CompilationSettings = CompilationSettings(), name: str = "", + engine_cache: Optional[BaseEngineCache] = None, ) -> PythonTorchTensorRTModule | TorchTensorRTModule: """Convert an FX module to a TRT module Args: @@ -129,10 +134,13 @@ def convert_module( inputs: Sequence of Tensors representing inputs to the module settings: Compilation settings name: TRT engine name + engine_cache: Engine cache instance Returns: PythonTorchTensorRTModule or TorchTensorRTModule """ - interpreter_result = interpret_module_to_result(module, inputs, settings) + interpreter_result = interpret_module_to_result( + module, inputs, settings, engine_cache=engine_cache + ) rt_cls = PythonTorchTensorRTModule diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 460fd7a9f8..66192d59a0 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -3,7 +3,7 @@ import logging from dataclasses import fields, replace from enum import Enum -from typing import Any, Callable, Dict, Optional, Sequence, Union +from typing import Any, Callable, Dict, Optional, Sequence, Tuple, Union import numpy as np import tensorrt as trt @@ -13,6 +13,7 @@ from torch_tensorrt._enums import dtype from torch_tensorrt._Input import Input from torch_tensorrt.dynamo import _defaults +from torch_tensorrt.dynamo._engine_caching import BaseEngineCache from torch_tensorrt.dynamo._settings import CompilationSettings from packaging import version @@ -438,7 +439,9 @@ def to_torch_tensorrt_device( return Device._from(device) -def parse_dynamo_kwargs(kwargs: Any) -> CompilationSettings: +def parse_dynamo_kwargs( + kwargs: Any, +) -> Tuple[CompilationSettings, Optional[BaseEngineCache]]: """Parses the kwargs field of a Dynamo backend Args: @@ -497,11 +500,15 @@ def parse_dynamo_kwargs(kwargs: Any) -> CompilationSettings: # If cache_built_engines and reuse_cached_engines are True but custom_engine_cache is not provided, # then create a default disk engine cache + engine_cache = None if kwargs.get("cache_built_engines") or kwargs.get("reuse_cached_engines"): assert kwargs.get( "make_refitable" ), "Engine caching requires make_refitable to be set to True" - if settings.custom_engine_cache is None: + + if kwargs.get("custom_engine_cache") is not None: + engine_cache = kwargs.get("custom_engine_cache") + else: from torch_tensorrt.dynamo._engine_caching import DiskEngineCache engine_cache_dir = kwargs.get( @@ -510,13 +517,11 @@ def parse_dynamo_kwargs(kwargs: Any) -> CompilationSettings: engine_cache_size = kwargs.get( "engine_cache_size", _defaults.ENGINE_CACHE_SIZE ) - settings.custom_engine_cache = DiskEngineCache( - engine_cache_dir, engine_cache_size - ) + engine_cache = DiskEngineCache(engine_cache_dir, engine_cache_size) logger.info("Compilation Settings: %s\n", settings) - return settings + return settings, engine_cache def req_torch_version(min_torch_version: str = "2.dev") -> Callable[..., Any]: diff --git a/tests/py/dynamo/models/test_engine_cache.py b/tests/py/dynamo/models/test_engine_cache.py index 7b6247ced9..1a5b874eb4 100644 --- 
a/tests/py/dynamo/models/test_engine_cache.py +++ b/tests/py/dynamo/models/test_engine_cache.py @@ -49,7 +49,7 @@ def load(self, hash: str, prefix: str = "blob") -> Optional[bytes]: class TestEngineCache(TestCase): - def test_dynamo_compile(self): + def test_dynamo_compile_with_default_disk_engine_cache(self): model = models.resnet18(pretrained=True).eval().to("cuda") example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),) # Mark the dim0 of inputs as dynamic @@ -57,15 +57,87 @@ def test_dynamo_compile(self): exp_program = torch.export.export( model, args=example_inputs, dynamic_shapes={"x": {0: batch}} ) + engine_cache_dir = ENGINE_CACHE_DIR if os.path.exists(engine_cache_dir): shutil.rmtree(engine_cache_dir) + + # The 1st iteration is to measure the compilation time without engine caching + # The 2nd and 3rd iterations are to measure the compilation time with engine caching. + # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. + # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. + inputs = [torch.rand((128, 3, 224, 224)).to("cuda")] + results = [] + times = [] + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + for i in range(3): + if i == 0: + cache_built_engines = False + reuse_cached_engines = False + else: + cache_built_engines = True + reuse_cached_engines = True + + start.record() + trt_gm = torch_trt.dynamo.compile( + exp_program, + tuple(inputs), + use_python_runtime=False, + enabled_precisions={torch.float}, + debug=False, + min_block_size=1, + make_refitable=True, + cache_built_engines=cache_built_engines, + reuse_cached_engines=reuse_cached_engines, + ) + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) + results.append(trt_gm(*inputs)) + + cos_sim = cosine_similarity(results[0], results[1]) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"test_dynamo_compile_with_default_disk_engine_cache: results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + cos_sim = cosine_similarity(results[1], results[2]) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"test_dynamo_compile_with_default_disk_engine_cache: results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + assertions.assertTrue( + times[0] > times[2], + msg=f"test_dynamo_compile_with_default_disk_engine_cache: Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", + ) + + def test_dynamo_compile_with_custom_engine_cache(self): + model = models.resnet18(pretrained=True).eval().to("cuda") + + engine_cache_dir = "/tmp/your_dir" + if os.path.exists(engine_cache_dir): + shutil.rmtree(engine_cache_dir) + + custom_engine_cache = MyEngineCache(engine_cache_dir) + + example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),) + # Mark the dim0 of inputs as dynamic + batch = torch.export.Dim("batch", min=1, max=200) + exp_program = torch.export.export( + model, args=example_inputs, dynamic_shapes={"x": {0: batch}} + ) + # The 1st iteration is to measure the compilation time without engine caching # The 2nd and 3rd iterations are to measure the compilation time with engine caching. # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. 
# The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. inputs = [torch.rand((128, 3, 224, 224)).to("cuda")] results = [] + times = [] + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) for i in range(3): if i == 0: cache_built_engines = False @@ -74,6 +146,7 @@ def test_dynamo_compile(self): cache_built_engines = True reuse_cached_engines = True + start.record() trt_gm = torch_trt.dynamo.compile( exp_program, tuple(inputs), @@ -84,23 +157,95 @@ def test_dynamo_compile(self): make_refitable=True, cache_built_engines=cache_built_engines, reuse_cached_engines=reuse_cached_engines, - engine_cache_size=1 << 30, # 1GB + custom_engine_cache=custom_engine_cache, ) + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) results.append(trt_gm(*inputs)) cos_sim = cosine_similarity(results[0], results[1]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_dynamo_compile TRT without engine caching doesn't match with that with engine caching. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"test_dynamo_compile_with_custom_engine_cache: results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) cos_sim = cosine_similarity(results[1], results[2]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_dynamo_compile TRT with engine caching doesn't match with that cached engine. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"test_dynamo_compile_with_custom_engine_cache: results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) - def test_torch_compile(self): + assertions.assertTrue( + times[0] > times[2], + msg=f"test_dynamo_compile_with_custom_engine_cache: Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", + ) + + def test_torch_compile_with_default_disk_engine_cache(self): + # Custom Engine Cache + model = models.resnet18(pretrained=True).eval().to("cuda") + + engine_cache_dir = "/tmp/test_torch_compile_with_default_disk_engine_cache" + if os.path.exists(engine_cache_dir): + shutil.rmtree(engine_cache_dir) + + # The 1st iteration is to measure the compilation time without engine caching + # The 2nd and 3rd iterations are to measure the compilation time with engine caching. + # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. + # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. 
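With the default disk cache exercised below, every graph hash gets its own subdirectory holding a single blob.bin. A rough sketch for inspecting what was written after a cached run (the directory matches the one configured in the test that follows):

    # Sketch: inspect DiskEngineCache output, assuming the on-disk layout used in
    # this patch: {engine_cache_dir}/{graph_hash}/blob.bin
    import os

    engine_cache_dir = "/tmp/test_torch_compile_with_default_disk_engine_cache"
    if os.path.exists(engine_cache_dir):
        for graph_hash in os.listdir(engine_cache_dir):
            blob_path = os.path.join(engine_cache_dir, graph_hash, "blob.bin")
            if os.path.isfile(blob_path):
                print(graph_hash, os.path.getsize(blob_path), "bytes")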
+ inputs = [torch.rand((100, 3, 224, 224)).to("cuda")] + results = [] + times = [] + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + for i in range(3): + # remove timing cache and reset dynamo for engine caching messurement + if i == 0: + cache_built_engines = False + reuse_cached_engines = False + else: + cache_built_engines = True + reuse_cached_engines = True + + start.record() + compiled_model = torch.compile( + model, + backend="tensorrt", + options={ + "use_python_runtime": True, + "enabled_precisions": {torch.float}, + "debug": False, + "min_block_size": 1, + "make_refitable": True, + "cache_built_engines": cache_built_engines, + "reuse_cached_engines": reuse_cached_engines, + "engine_cache_dir": engine_cache_dir, + "engine_cache_size": 1 << 30, # 1GB + }, + ) + results.append(compiled_model(*inputs)) # trigger the compilation + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) + + cos_sim = cosine_similarity(results[0], results[1]) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"test_torch_compile_with_default_disk_engine_cache: results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + cos_sim = cosine_similarity(results[1], results[2]) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"test_torch_compile_with_default_disk_engine_cache: results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + assertions.assertTrue( + times[0] > times[2], + msg=f"test_torch_compile_with_default_disk_engine_cache: Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", + ) + + def test_torch_compile_with_custom_engine_cache(self): # Custom Engine Cache model = models.resnet18(pretrained=True).eval().to("cuda") @@ -108,13 +253,16 @@ def test_torch_compile(self): if os.path.exists(engine_cache_dir): shutil.rmtree(engine_cache_dir) - engine_cache = MyEngineCache(engine_cache_dir) + custom_engine_cache = MyEngineCache(engine_cache_dir) # The 1st iteration is to measure the compilation time without engine caching # The 2nd and 3rd iterations are to measure the compilation time with engine caching. # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. 
inputs = [torch.rand((100, 3, 224, 224)).to("cuda")] results = [] + times = [] + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) for i in range(3): # remove timing cache and reset dynamo for engine caching messurement if i == 0: @@ -124,6 +272,7 @@ def test_torch_compile(self): cache_built_engines = True reuse_cached_engines = True + start.record() compiled_model = torch.compile( model, backend="tensorrt", @@ -135,19 +284,27 @@ def test_torch_compile(self): "make_refitable": True, "cache_built_engines": cache_built_engines, "reuse_cached_engines": reuse_cached_engines, - "custom_engine_cache": engine_cache, # use custom engine cache + "custom_engine_cache": custom_engine_cache, }, ) results.append(compiled_model(*inputs)) # trigger the compilation + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) cos_sim = cosine_similarity(results[0], results[1]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_torch_compile TRT without engine caching doesn't match with that with engine caching. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"test_torch_compile_with_custom_engine_cache: results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) cos_sim = cosine_similarity(results[1], results[2]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_torch_compile TRT with engine caching doesn't match with that cached engine. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"test_torch_compile_with_custom_engine_cache: results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + assertions.assertTrue( + times[0] > times[2], + msg=f"test_torch_compile_with_custom_engine_cache: Engine caching didn't speed up the compilation. 
Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", ) From f3e4234f2f5d710895b48d6205b5f799bbc4bd19 Mon Sep 17 00:00:00 2001 From: Evan Li Date: Wed, 28 Aug 2024 18:31:35 -0700 Subject: [PATCH 12/14] fix CI errors --- .../conversion/test_bitwise_and_aten.py | 7 ++++- .../conversion/test_embedding_bag_aten.py | 7 ++++- .../conversion/test_index_select_aten.py | 7 ++++- tests/py/dynamo/models/test_dtype_support.py | 14 ++++++++++ tests/py/dynamo/models/test_dyn_models.py | 14 ++++++++++ tests/py/dynamo/models/test_engine_cache.py | 26 ++++++++++--------- .../dynamo/models/test_export_kwargs_serde.py | 14 ++++++++++ tests/py/dynamo/models/test_export_serde.py | 23 +++++++++++++++- tests/py/dynamo/models/test_models.py | 10 +++++++ tests/py/dynamo/models/test_models_export.py | 14 ++++++++++ 10 files changed, 120 insertions(+), 16 deletions(-) diff --git a/tests/py/dynamo/conversion/test_bitwise_and_aten.py b/tests/py/dynamo/conversion/test_bitwise_and_aten.py index a29a8061db..c42fd2e61f 100644 --- a/tests/py/dynamo/conversion/test_bitwise_and_aten.py +++ b/tests/py/dynamo/conversion/test_bitwise_and_aten.py @@ -141,7 +141,12 @@ def forward(self, lhs_val, rhs_val): mod, inputs, dynamic_shapes=({1: dyn_dim}, {0: dyn_dim}) ) trt_mod = torch_tensorrt.dynamo.compile( - fx_mod, inputs=inputs, enable_precisions={torch.bool}, min_block_size=1 + fx_mod, + inputs=inputs, + enable_precisions={torch.bool}, + min_block_size=1, + cache_built_engines=False, + reuse_cached_engines=False, ) with torch.no_grad(): cuda_inputs = [] diff --git a/tests/py/dynamo/conversion/test_embedding_bag_aten.py b/tests/py/dynamo/conversion/test_embedding_bag_aten.py index d935134ff2..3fef3d70cf 100644 --- a/tests/py/dynamo/conversion/test_embedding_bag_aten.py +++ b/tests/py/dynamo/conversion/test_embedding_bag_aten.py @@ -484,7 +484,12 @@ def forward(self, weights, indices, offsets, per_sample_weights=None): dynamic_shapes["per_sample_weights"] = {} fx_mod = torch.export.export(mod, inputs, dynamic_shapes=dynamic_shapes) trt_mod = torch_tensorrt.dynamo.compile( - fx_mod, inputs=inputs, enable_precisions=torch.float32, min_block_size=1 + fx_mod, + inputs=inputs, + enable_precisions=torch.float32, + min_block_size=1, + cache_built_engines=False, + reuse_cached_engines=False, ) # use the inputs with different shape to inference: if per_sample_weights is None: diff --git a/tests/py/dynamo/conversion/test_index_select_aten.py b/tests/py/dynamo/conversion/test_index_select_aten.py index 3d0b41b791..b1339efdcf 100644 --- a/tests/py/dynamo/conversion/test_index_select_aten.py +++ b/tests/py/dynamo/conversion/test_index_select_aten.py @@ -109,7 +109,12 @@ def forward(self, source_tensor, indice_tensor): fx_mod = torch.export.export(mod, inputs, dynamic_shapes=dynamic_shapes) trt_mod = torch_tensorrt.dynamo.compile( - fx_mod, inputs=inputs, enable_precisions=torch.float32, min_block_size=1 + fx_mod, + inputs=inputs, + enable_precisions=torch.float32, + min_block_size=1, + cache_built_engines=False, + reuse_cached_engines=False, ) # use different shape of inputs for inference: inputs = (source_tensor_1, indice_tensor) diff --git a/tests/py/dynamo/models/test_dtype_support.py b/tests/py/dynamo/models/test_dtype_support.py index 29faf4eff3..b486784e52 100644 --- a/tests/py/dynamo/models/test_dtype_support.py +++ b/tests/py/dynamo/models/test_dtype_support.py @@ -41,6 +41,8 @@ def forward(self, x): truncate_double=True, min_block_size=1, use_python_runtime=False, + cache_built_engines=False, 
+ reuse_cached_engines=False, ) torch_model_results = mod(in_tensor) @@ -79,6 +81,8 @@ def forward(self, x): truncate_double=True, min_block_size=1, use_python_runtime=True, + cache_built_engines=False, + reuse_cached_engines=False, ) torch_model_results = mod(in_tensor) @@ -123,6 +127,8 @@ def forward(self, x): truncate_double=False, min_block_size=1, use_python_runtime=False, + cache_built_engines=False, + reuse_cached_engines=False, ) torch_model_results = mod(in_tensor) @@ -162,6 +168,8 @@ def forward(self, x): truncate_double=False, min_block_size=1, use_python_runtime=True, + cache_built_engines=False, + reuse_cached_engines=False, ) torch_model_results = mod(in_tensor) @@ -214,6 +222,8 @@ def forward(self, x): enabled_precisions={torch.float, torch.bfloat16, torch.half}, min_block_size=1, use_python_runtime=False, + cache_built_engines=False, + reuse_cached_engines=False, ) torch_model_results = mod(in_tensor) @@ -252,6 +262,8 @@ def forward(self, x): enabled_precisions={torch.float, torch.bfloat16, torch.half}, min_block_size=1, use_python_runtime=True, + cache_built_engines=False, + reuse_cached_engines=False, ) torch_model_results = mod(in_tensor) @@ -289,6 +301,8 @@ def forward(self, x): debug=True, min_block_size=1, device=device, + cache_built_engines=False, + reuse_cached_engines=False, ) torch_model_results = mod(*inputs) diff --git a/tests/py/dynamo/models/test_dyn_models.py b/tests/py/dynamo/models/test_dyn_models.py index 67eaddcc6c..d5627499f5 100644 --- a/tests/py/dynamo/models/test_dyn_models.py +++ b/tests/py/dynamo/models/test_dyn_models.py @@ -36,6 +36,8 @@ def forward(self, x): "ir": ir, "pass_through_build_failures": True, "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, } if ir == "torch_compile": input_bs4 = torch.randn((4, 3, 224, 224)).to("cuda") @@ -90,6 +92,8 @@ def forward(self, x): "pass_through_build_failures": True, "torch_executed_ops": {"torch.ops.aten.abs.default"}, "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, } if ir == "torch_compile": @@ -141,6 +145,8 @@ def forward(self, x): "ir": ir, "pass_through_build_failures": True, "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, } if ir == "torch_compile": @@ -184,6 +190,8 @@ def test_resnet_dynamic(ir): "ir": ir, "pass_through_build_failures": True, "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, } if ir == "torch_compile": @@ -246,6 +254,8 @@ def forward(self, x): "pass_through_build_failures": True, "optimization_level": 1, "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -278,6 +288,8 @@ def forward(self, x): "enabled_precisions": {torch.float}, "ir": ir, "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, } inputs_bs2 = torch.randn(2, 2, 10).to("cuda") if ir == "torch_compile": @@ -332,6 +344,8 @@ def forward(self, x): "pass_through_build_failures": True, "min_block_size": 1, "torch_executed_ops": {"torch.ops.aten.add.Tensor"}, + "cache_built_engines": False, + "reuse_cached_engines": False, } # Compile the model diff --git a/tests/py/dynamo/models/test_engine_cache.py b/tests/py/dynamo/models/test_engine_cache.py index 1a5b874eb4..24bb96c4f2 100644 --- a/tests/py/dynamo/models/test_engine_cache.py +++ b/tests/py/dynamo/models/test_engine_cache.py @@ -21,6 +21,8 @@ def __init__( engine_cache_dir: str, ) -> None: 
self.engine_cache_dir = engine_cache_dir + if not os.path.exists(self.engine_cache_dir): + os.makedirs(self.engine_cache_dir, exist_ok=True) def save( self, @@ -99,18 +101,18 @@ def test_dynamo_compile_with_default_disk_engine_cache(self): cos_sim = cosine_similarity(results[0], results[1]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_dynamo_compile_with_default_disk_engine_cache: results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) cos_sim = cosine_similarity(results[1], results[2]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_dynamo_compile_with_default_disk_engine_cache: results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) assertions.assertTrue( times[0] > times[2], - msg=f"test_dynamo_compile_with_default_disk_engine_cache: Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", + msg=f"Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", ) def test_dynamo_compile_with_custom_engine_cache(self): @@ -167,18 +169,18 @@ def test_dynamo_compile_with_custom_engine_cache(self): cos_sim = cosine_similarity(results[0], results[1]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_dynamo_compile_with_custom_engine_cache: results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) cos_sim = cosine_similarity(results[1], results[2]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_dynamo_compile_with_custom_engine_cache: results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) assertions.assertTrue( times[0] > times[2], - msg=f"test_dynamo_compile_with_custom_engine_cache: Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", + msg=f"Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", ) def test_torch_compile_with_default_disk_engine_cache(self): @@ -231,18 +233,18 @@ def test_torch_compile_with_default_disk_engine_cache(self): cos_sim = cosine_similarity(results[0], results[1]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_torch_compile_with_default_disk_engine_cache: results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) cos_sim = cosine_similarity(results[1], results[2]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_torch_compile_with_default_disk_engine_cache: results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"results[1] doesn't match with results[2]. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) assertions.assertTrue( times[0] > times[2], - msg=f"test_torch_compile_with_default_disk_engine_cache: Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", + msg=f"Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", ) def test_torch_compile_with_custom_engine_cache(self): @@ -295,16 +297,16 @@ def test_torch_compile_with_custom_engine_cache(self): cos_sim = cosine_similarity(results[0], results[1]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_torch_compile_with_custom_engine_cache: results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) cos_sim = cosine_similarity(results[1], results[2]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_torch_compile_with_custom_engine_cache: results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) assertions.assertTrue( times[0] > times[2], - msg=f"test_torch_compile_with_custom_engine_cache: Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", + msg=f"Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", ) diff --git a/tests/py/dynamo/models/test_export_kwargs_serde.py b/tests/py/dynamo/models/test_export_kwargs_serde.py index 08b23d55e0..52a927e518 100644 --- a/tests/py/dynamo/models/test_export_kwargs_serde.py +++ b/tests/py/dynamo/models/test_export_kwargs_serde.py @@ -63,6 +63,8 @@ def forward(self, x, b=5, c=None, d=None): "optimization_level": 1, "min_block_size": 1, "ir": "dynamo", + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torch.export.export(model, args=tuple(args), kwargs=kwargs) @@ -122,6 +124,8 @@ def forward(self, x, b=5, c=None, d=None): "optimization_level": 1, "min_block_size": 1, "ir": "dynamo", + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ -190,6 +194,8 @@ def forward(self, x, b=5, c=None, d=None): "optimization_level": 1, "min_block_size": 1, "ir": "dynamo", + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ -271,6 +277,8 @@ def forward(self, x, b=None, c=None, d=None, e=[]): "optimization_level": 1, "min_block_size": 1, "ir": "dynamo", + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ -358,6 +366,8 @@ def forward(self, x, b=None, c=None, d=None, e=[]): "optimization_level": 1, "min_block_size": 1, "ir": "dynamo", + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ -444,6 +454,8 @@ def forward(self, x, b=None, c=None, d=None, e=[]): "optimization_level": 1, "min_block_size": 1, "ir": "dynamo", + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ 
-505,6 +517,8 @@ def forward(self, x, b=5, c=None, d=None): "optimization_level": 1, "min_block_size": 1, "ir": "dynamo", + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torch.export.export(model, args=tuple(args), kwargs=kwargs) diff --git a/tests/py/dynamo/models/test_export_serde.py b/tests/py/dynamo/models/test_export_serde.py index c0c0ba0f22..146cc2addf 100644 --- a/tests/py/dynamo/models/test_export_serde.py +++ b/tests/py/dynamo/models/test_export_serde.py @@ -42,6 +42,8 @@ def forward(self, x): ], "ir": ir, "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ -94,6 +96,8 @@ def forward(self, x): ], "ir": ir, "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ -150,6 +154,8 @@ def forward(self, x): ) ], "ir": ir, + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ -209,6 +215,8 @@ def forward(self, x): "ir": ir, "min_block_size": 1, "torch_executed_ops": {"torch.ops.aten.relu.default"}, + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ -250,6 +258,8 @@ def test_resnet18(ir): ], "ir": ir, "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ -293,6 +303,8 @@ def test_resnet18_dynamic(ir): ], "ir": ir, "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ -340,6 +352,8 @@ def forward(self, x): "ir": ir, "min_block_size": 1, "torch_executed_ops": {"torch.ops.aten.convolution.default"}, + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ -388,7 +402,14 @@ def forward(self, x): model = MyModule().eval().cuda() input = torch.randn((1, 3, 224, 224)).to("cuda") - trt_gm = torchtrt.compile(model, ir=ir, inputs=[input], min_block_size=1) + trt_gm = torchtrt.compile( + model, + ir=ir, + inputs=[input], + min_block_size=1, + cache_built_engines=False, + reuse_cached_engines=False, + ) assertions.assertTrue( isinstance(trt_gm, torch.fx.GraphModule), msg=f"test_save_load_ts output type does not match with torch.fx.GraphModule", diff --git a/tests/py/dynamo/models/test_models.py b/tests/py/dynamo/models/test_models.py index 2d45af2b49..ba6cb0c776 100644 --- a/tests/py/dynamo/models/test_models.py +++ b/tests/py/dynamo/models/test_models.py @@ -30,6 +30,8 @@ def test_resnet18(ir): "pass_through_build_failures": True, "optimization_level": 1, "ir": "torch_compile", + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -61,6 +63,8 @@ def test_mobilenet_v2(ir): "optimization_level": 1, "min_block_size": 10, "ir": "torch_compile", + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -92,6 +96,8 @@ def test_efficientnet_b0(ir): "optimization_level": 1, "min_block_size": 10, "ir": "torch_compile", + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -132,6 +138,8 @@ def test_bert_base_uncased(ir): "optimization_level": 1, "min_block_size": 15, "ir": "torch_compile", 
+ "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -166,6 +174,8 @@ def test_resnet18_half(ir): "pass_through_build_failures": True, "optimization_level": 1, "ir": "torch_compile", + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) diff --git a/tests/py/dynamo/models/test_models_export.py b/tests/py/dynamo/models/test_models_export.py index df71d6b58a..bf19c3c5e6 100644 --- a/tests/py/dynamo/models/test_models_export.py +++ b/tests/py/dynamo/models/test_models_export.py @@ -31,6 +31,8 @@ def test_resnet18(ir): "pass_through_build_failures": True, "optimization_level": 1, "min_block_size": 8, + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -61,6 +63,8 @@ def test_mobilenet_v2(ir): "pass_through_build_failures": True, "optimization_level": 1, "min_block_size": 8, + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -91,6 +95,8 @@ def test_efficientnet_b0(ir): "pass_through_build_failures": True, "optimization_level": 1, "min_block_size": 8, + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -130,6 +136,8 @@ def test_bert_base_uncased(ir): "truncate_double": True, "ir": ir, "min_block_size": 10, + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) model_outputs = model(input, input2) @@ -168,6 +176,8 @@ def test_resnet18_half(ir): "pass_through_build_failures": True, "optimization_level": 1, "min_block_size": 8, + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -223,6 +233,8 @@ def calibrate_loop(model): enabled_precisions={torch.float8_e4m3fn}, min_block_size=1, debug=True, + cache_built_engines=False, + reuse_cached_engines=False, ) outputs_trt = trt_model(input_tensor) assert torch.allclose(output_pyt, outputs_trt, rtol=1e-3, atol=1e-2) @@ -272,6 +284,8 @@ def calibrate_loop(model): enabled_precisions={torch.int8}, min_block_size=1, debug=True, + cache_built_engines=False, + reuse_cached_engines=False, ) outputs_trt = trt_model(input_tensor) assert torch.allclose(output_pyt, outputs_trt, rtol=1e-3, atol=1e-2) From 42d18ac1402e87b19d98ba1d85e96d8bcb79cb2a Mon Sep 17 00:00:00 2001 From: Evan Li Date: Wed, 28 Aug 2024 22:23:01 -0700 Subject: [PATCH 13/14] fix CI errors --- tests/py/dynamo/models/test_engine_cache.py | 7 +++++++ tests/py/dynamo/runtime/test_001_streams.py | 2 ++ tests/py/dynamo/runtime/test_002_lazy_engine_init.py | 10 ++++++++++ 3 files changed, 19 insertions(+) diff --git a/tests/py/dynamo/models/test_engine_cache.py b/tests/py/dynamo/models/test_engine_cache.py index 24bb96c4f2..46916f7c62 100644 --- a/tests/py/dynamo/models/test_engine_cache.py +++ b/tests/py/dynamo/models/test_engine_cache.py @@ -4,6 +4,7 @@ import unittest from typing import Optional +import pytest import torch import torch_tensorrt as torch_trt import torchvision.models as models @@ -183,6 +184,9 @@ def test_dynamo_compile_with_custom_engine_cache(self): msg=f"Engine caching didn't speed up the compilation. 
Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", ) + @pytest.mark.skip( + reason="The test needs a fix for refit, which is reported in https://github.com/pytorch/TensorRT/issues/3126" + ) def test_torch_compile_with_default_disk_engine_cache(self): # Custom Engine Cache model = models.resnet18(pretrained=True).eval().to("cuda") @@ -247,6 +251,9 @@ def test_torch_compile_with_default_disk_engine_cache(self): msg=f"Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", ) + @pytest.mark.skip( + reason="The test needs a fix for refit, which is reported in https://github.com/pytorch/TensorRT/issues/3126" + ) def test_torch_compile_with_custom_engine_cache(self): # Custom Engine Cache model = models.resnet18(pretrained=True).eval().to("cuda") diff --git a/tests/py/dynamo/runtime/test_001_streams.py b/tests/py/dynamo/runtime/test_001_streams.py index 574db6611e..aaec9e3d41 100644 --- a/tests/py/dynamo/runtime/test_001_streams.py +++ b/tests/py/dynamo/runtime/test_001_streams.py @@ -31,6 +31,8 @@ def forward(self, x): enabled_precisions={dtype}, min_block_size=1, device=device, + cache_built_engines=False, + reuse_cached_engines=False, ) for i in range(100): diff --git a/tests/py/dynamo/runtime/test_002_lazy_engine_init.py b/tests/py/dynamo/runtime/test_002_lazy_engine_init.py index 1f3de69eb3..008b0f53b1 100644 --- a/tests/py/dynamo/runtime/test_002_lazy_engine_init.py +++ b/tests/py/dynamo/runtime/test_002_lazy_engine_init.py @@ -160,6 +160,8 @@ def test_lazy_engine_init_py_e2e(self): "ir": "dynamo", "lazy_engine_init": True, "use_python_runtime": True, + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -194,6 +196,8 @@ def test_lazy_engine_init_cpp_e2e(self): "ir": "dynamo", "lazy_engine_init": True, "use_python_runtime": False, + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -228,6 +232,8 @@ def test_lazy_engine_init_cpp_serialization(self): "ir": "dynamo", "lazy_engine_init": True, "use_python_runtime": False, + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -276,6 +282,8 @@ def forward(self, a, b): "lazy_engine_init": True, "use_python_runtime": True, "torch_executed_ops": [torch.ops.aten.sub.Tensor], + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.dynamo.compile(exp_program, **compile_spec) @@ -318,6 +326,8 @@ def forward(self, a, b): "lazy_engine_init": True, "use_python_runtime": False, "torch_executed_ops": [torch.ops.aten.sub.Tensor], + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.dynamo.compile(exp_program, **compile_spec) From fc525e6068be472f58794e355d5fdb31f22b9c48 Mon Sep 17 00:00:00 2001 From: Evan Li Date: Thu, 29 Aug 2024 11:45:20 -0700 Subject: [PATCH 14/14] fix refit issue for torch.compile --- py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py | 5 ++--- tests/py/dynamo/models/test_engine_cache.py | 6 ------ 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 22743af0aa..3c97c8347a 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -40,7 +40,7 @@ 
get_node_name, get_trt_tensor, ) -from torch_tensorrt.dynamo.utils import DYNAMIC_DIM, to_torch_device +from torch_tensorrt.dynamo.utils import DYNAMIC_DIM, get_model_device, to_torch_device from torch_tensorrt.fx.observer import Observer from torch_tensorrt.logging import TRT_LOGGER @@ -434,9 +434,8 @@ def _save_weight_mapping(self) -> None: """ _LOGGER.info("Building weight name mapping...") # Stage 1: Name mapping - sd = self.module.state_dict() torch_device = to_torch_device(self.compilation_settings.device) - gm_is_on_cuda = list(sd.values())[0].device.type == "cuda" + gm_is_on_cuda = get_model_device(self.module).type == "cuda" if not gm_is_on_cuda: # If the model original position is on CPU, move it GPU sd = { diff --git a/tests/py/dynamo/models/test_engine_cache.py b/tests/py/dynamo/models/test_engine_cache.py index 46916f7c62..189a492d4e 100644 --- a/tests/py/dynamo/models/test_engine_cache.py +++ b/tests/py/dynamo/models/test_engine_cache.py @@ -184,9 +184,6 @@ def test_dynamo_compile_with_custom_engine_cache(self): msg=f"Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", ) - @pytest.mark.skip( - reason="The test needs a fix for refit, which is reported in https://github.com/pytorch/TensorRT/issues/3126" - ) def test_torch_compile_with_default_disk_engine_cache(self): # Custom Engine Cache model = models.resnet18(pretrained=True).eval().to("cuda") @@ -251,9 +248,6 @@ def test_torch_compile_with_default_disk_engine_cache(self): msg=f"Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", ) - @pytest.mark.skip( - reason="The test needs a fix for refit, which is reported in https://github.com/pytorch/TensorRT/issues/3126" - ) def test_torch_compile_with_custom_engine_cache(self): # Custom Engine Cache model = models.resnet18(pretrained=True).eval().to("cuda")
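
Usage note: the snippet below is a minimal sketch of how the caching options exercised by the tests in this series could be enabled through the torch.compile path. It assumes a CUDA-capable GPU and a torch_tensorrt build that includes these patches; the model, input shape, and cache directory are illustrative placeholders, and the option names mirror the final test code above.

import torch
import torch_tensorrt  # noqa: F401  -- importing registers the "tensorrt" backend
import torchvision.models as models

model = models.resnet18(pretrained=True).eval().to("cuda")
inputs = [torch.rand((100, 3, 224, 224)).to("cuda")]

compiled_model = torch.compile(
    model,
    backend="tensorrt",
    options={
        "use_python_runtime": True,
        "enabled_precisions": {torch.float},
        "min_block_size": 1,
        "make_refitable": True,  # the tests enable refit alongside caching
        "cache_built_engines": True,  # save newly built engines to the cache
        "reuse_cached_engines": True,  # reload a cached engine when the hash matches
        "engine_cache_dir": "/tmp/torch_trt_engine_cache",  # illustrative path
        "engine_cache_size": 1 << 30,  # 1GB budget, as in the tests
    },
)
compiled_model(*inputs)  # first call triggers compilation and populates the cache

Recompiling the same graph afterwards should be noticeably faster, which is what the times[0] > times[2] assertions in the tests above verify.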