From 1d60e5363eff02d0b1a70c6c64b6da5c6bb59009 Mon Sep 17 00:00:00 2001 From: Evan Li Date: Wed, 10 Jul 2024 14:48:22 -0700 Subject: [PATCH 01/14] feat: engine caching revert backend changes update dynamo path add save_engine_cache and load_engine_cache args support customizing engine cache class refactor and add LRU to clear cache fix bug --- examples/dynamo/engine_caching_example.py | 174 ++++++++++++++ py/torch_tensorrt/dynamo/_compiler.py | 39 ++++ py/torch_tensorrt/dynamo/_defaults.py | 12 +- py/torch_tensorrt/dynamo/_engine_caching.py | 212 ++++++++++++++++++ py/torch_tensorrt/dynamo/_settings.py | 16 ++ py/torch_tensorrt/dynamo/backend/backends.py | 2 +- .../dynamo/conversion/_TRTInterpreter.py | 40 ++++ 7 files changed, 493 insertions(+), 2 deletions(-) create mode 100644 examples/dynamo/engine_caching_example.py create mode 100644 py/torch_tensorrt/dynamo/_engine_caching.py diff --git a/examples/dynamo/engine_caching_example.py b/examples/dynamo/engine_caching_example.py new file mode 100644 index 0000000000..a7b8f02f7a --- /dev/null +++ b/examples/dynamo/engine_caching_example.py @@ -0,0 +1,174 @@ +import ast +import logging +import os +from typing import List, Optional, Tuple + +import numpy as np +import torch +import torch_tensorrt as torch_trt +import torchvision.models as models +from torch_tensorrt.dynamo._defaults import TIMING_CACHE_PATH +from torch_tensorrt.dynamo._engine_caching import BaseEngineCache + +_LOGGER: logging.Logger = logging.getLogger(__name__) + + +np.random.seed(0) +torch.manual_seed(0) +size = (100, 3, 224, 224) + +model = models.resnet18(pretrained=True).eval().to("cuda") +enabled_precisions = {torch.float} +debug = False +min_block_size = 1 +use_python_runtime = False + + +def remove_timing_cache(path=TIMING_CACHE_PATH): + if os.path.exists(path): + os.remove(path) + + +def dynamo_path(iterations=3): + times = [] + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),) + # Mark the dim0 of inputs as dynamic + batch = torch.export.Dim("batch", min=1, max=200) + exp_program = torch.export.export( + model, args=example_inputs, dynamic_shapes={"x": {0: batch}} + ) + + for i in range(iterations): + inputs = [torch.rand((100 + i, 3, 224, 224)).to("cuda")] + remove_timing_cache() # remove timing cache for engine caching messurement + if i == 0: + save_engine_cache = False + load_engine_cache = False + else: + save_engine_cache = True + load_engine_cache = True + + start.record() + trt_gm = torch_trt.dynamo.compile( + exp_program, + tuple(inputs), + use_python_runtime=use_python_runtime, + enabled_precisions=enabled_precisions, + debug=debug, + min_block_size=min_block_size, + make_refitable=True, + save_engine_cache=save_engine_cache, + load_engine_cache=load_engine_cache, + engine_cache_size=1 << 30, # 1GB + ) + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) + + print("-----dynamo_path-----> compilation time:", times, "milliseconds") + + +# Custom Engine Cache +class MyEngineCache(BaseEngineCache): + + def __init__( + self, + engine_cache_size: int, + engine_cache_dir: str, + ) -> None: + self.total_engine_cache_size = engine_cache_size + self.available_engine_cache_size = engine_cache_size + self.engine_cache_dir = engine_cache_dir + + def save( + self, + hash: str, + serialized_engine: bytes, + input_names: List[str], + output_names: List[str], + ) -> bool: + path = os.path.join( + self.engine_cache_dir, + 
f"{hash}/engine--{input_names}--{output_names}.trt", + ) + try: + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "wb") as f: + f.write(serialized_engine) + except Exception as e: + _LOGGER.warning(f"Failed to save the TRT engine to {path}: {e}") + return False + + _LOGGER.info(f"A TRT engine was cached to {path}") + serialized_engine_size = int(serialized_engine.nbytes) + self.available_engine_cache_size -= serialized_engine_size + return True + + def load(self, hash: str) -> Tuple[Optional[bytes], List[str], List[str]]: + directory = os.path.join(self.engine_cache_dir, hash) + if os.path.exists(directory): + engine_list = os.listdir(directory) + assert ( + len(engine_list) == 1 + ), f"There are more than one engine {engine_list} under {directory}." + path = os.path.join(directory, engine_list[0]) + input_names_str, output_names_str = ( + engine_list[0].split(".trt")[0].split("--")[1:] + ) + input_names = ast.literal_eval(input_names_str) + output_names = ast.literal_eval(output_names_str) + with open(path, "rb") as f: + serialized_engine = f.read() + return serialized_engine, input_names, output_names + else: + return None, [], [] + + +def compile_path(iterations=3): + times = [] + engine_cache = MyEngineCache(200 * (1 << 20), "/tmp/your_dir") + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + for i in range(iterations): + inputs = [torch.rand(size).to("cuda")] + # remove timing cache and reset dynamo for engine caching messurement + remove_timing_cache() + torch._dynamo.reset() + + if i == 0: + save_engine_cache = False + load_engine_cache = False + else: + save_engine_cache = True + load_engine_cache = True + + start.record() + compiled_model = torch.compile( + model, + backend="tensorrt", + options={ + "use_python_runtime": use_python_runtime, + "enabled_precisions": enabled_precisions, + "debug": debug, + "min_block_size": min_block_size, + "make_refitable": True, + "save_engine_cache": save_engine_cache, + "load_engine_cache": load_engine_cache, + "engine_cache_instance": engine_cache, # use custom engine cache + }, + ) + compiled_model(*inputs) # trigger the compilation + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) + + print("-----compile_path-----> compilation time:", times, "milliseconds") + + +if __name__ == "__main__": + dynamo_path() + compile_path() diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index a4849f257e..229ecb5ef7 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -18,6 +18,7 @@ dryrun_stats_display, parse_non_trt_nodes, ) +from torch_tensorrt.dynamo._engine_caching import BaseEngineCache, EngineCache from torch_tensorrt.dynamo.conversion import ( CompilationSettings, UnsupportedOperatorException, @@ -82,6 +83,11 @@ def compile( hardware_compatible: bool = _defaults.HARDWARE_COMPATIBLE, timing_cache_path: str = _defaults.TIMING_CACHE_PATH, lazy_engine_init: bool = _defaults.LAZY_ENGINE_INIT, + save_engine_cache: bool = _defaults.SAVE_ENGINE_CACHE, + load_engine_cache: bool = _defaults.LOAD_ENGINE_CACHE, + engine_cache_dir: str = _defaults.ENGINE_CACHE_DIR, + engine_cache_size: int = _defaults.ENGINE_CACHE_SIZE, + engine_cache_instance: Optional[BaseEngineCache] = None, **kwargs: Any, ) -> torch.fx.GraphModule: """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT @@ -147,6 +153,11 @@ def compile( hardware_compatible (bool): Build the TensorRT engines compatible 
with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer) timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation lazy_engine_init (bool): Defer setting up engines until the compilation of all engines is complete. Can allow larger models with multiple graph breaks to compile but can lead to oversubscription of GPU memory at runtime. + save_engine_cache (bool): Whether to save the compiled TRT engines to hard disk + load_engine_cache (bool): Whether to load the compiled TRT engines from hard disk + engine_cache_dir (str): Directory to store the cached TRT engines + engine_cache_size (int): Maximum hard-disk space to use for the engine cache + engine_cache_instance (Optional[BaseEngineCache]): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache **kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT @@ -224,6 +235,11 @@ def compile( gm = post_lowering(gm) logger.debug("Lowered Input graph: " + str(gm.graph)) + if engine_cache_instance is None: + engine_cache_instance = EngineCacheInstanceCreator.get_creator( + engine_cache_size, engine_cache_dir + ).engine_cache_instance + compilation_options = { "enabled_precisions": ( enabled_precisions if enabled_precisions else _defaults.ENABLED_PRECISIONS @@ -257,6 +273,11 @@ def compile( "hardware_compatible": hardware_compatible, "timing_cache_path": timing_cache_path, "lazy_engine_init": lazy_engine_init, + "save_engine_cache": save_engine_cache, + "load_engine_cache": load_engine_cache, + "engine_cache_dir": engine_cache_dir, + "engine_cache_size": engine_cache_size, + "engine_cache_instance": engine_cache_instance, } settings = CompilationSettings(**compilation_options) @@ -665,3 +686,21 @@ def convert_exported_program_to_serialized_trt_engine( serialized_engine: bytes = interpreter_result.serialized_engine return serialized_engine + + +class EngineCacheInstanceCreator: + engine_cache_creator = None + + def __init__(self, engine_cache_size: int, engine_cache_dir: str) -> None: + self.engine_cache_instance = EngineCache( + engine_cache_size=engine_cache_size, + engine_cache_dir=engine_cache_dir, + ) + + @classmethod + def get_creator( + cls, engine_cache_size: int, engine_cache_dir: str + ) -> EngineCacheInstanceCreator: + if cls.engine_cache_creator is None: + cls.engine_cache_creator = cls(engine_cache_size, engine_cache_dir) + return cls.engine_cache_creator diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py index 2696e26936..e90f3f8c2a 100644 --- a/py/torch_tensorrt/dynamo/_defaults.py +++ b/py/torch_tensorrt/dynamo/_defaults.py @@ -4,6 +4,7 @@ import torch from torch_tensorrt._Device import Device from torch_tensorrt._enums import EngineCapability, dtype +from torch_tensorrt.dynamo._engine_caching import EngineCache ENABLED_PRECISIONS = {dtype.f32} DEBUG = False @@ -31,8 +32,17 @@ DRYRUN = False HARDWARE_COMPATIBLE = False SUPPORTED_KERNEL_PRECISIONS = {dtype.f32, dtype.f16, dtype.bf16, dtype.i8, dtype.f8} -TIMING_CACHE_PATH = os.path.join(tempfile.gettempdir(), "timing_cache.bin") +TIMING_CACHE_PATH = os.path.join( + tempfile.gettempdir(), "torch_tensorrt_engine_cache", "timing_cache.bin" +) LAZY_ENGINE_INIT = False +SAVE_ENGINE_CACHE = True +LOAD_ENGINE_CACHE = True +ENGINE_CACHE_DIR = os.path.join(tempfile.gettempdir(), "torch_tensorrt_engine_cache") 
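+# Default disk budget for cached TRT engines: 1073741824 bytes = 1 << 30, i.e. 1 GiB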
+ENGINE_CACHE_SIZE = 1073741824 +ENGINE_CACHE_INSTANCE = EngineCache( + engine_cache_size=ENGINE_CACHE_SIZE, engine_cache_dir=ENGINE_CACHE_DIR +) def default_device() -> Device: diff --git a/py/torch_tensorrt/dynamo/_engine_caching.py b/py/torch_tensorrt/dynamo/_engine_caching.py new file mode 100644 index 0000000000..f491bc2523 --- /dev/null +++ b/py/torch_tensorrt/dynamo/_engine_caching.py @@ -0,0 +1,212 @@ +import ast +import copy +import logging +import os +import shutil +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional, Tuple, cast + +import torch +from torch._inductor.codecache import FxGraphCachePickler +from torch.fx.experimental.proxy_tensor import maybe_disable_fake_tensor_mode + +_LOGGER: logging.Logger = logging.getLogger(__name__) + + +class BaseEngineCache(ABC): + + @abstractmethod + def __init__( + self, + *args: Any, + **kwargs: Any, + ) -> None: + pass + + @staticmethod + def get_hash(gm: torch.fx.GraphModule) -> str: + """Get the hash value of the GraphModule + + Args: + gm (torch.fx.GraphModule): GraphModule to hash + + Returns: + str: hash value of the GraphModule + """ + # parameters are set to 0 + with maybe_disable_fake_tensor_mode(): + new_gm = copy.deepcopy(gm) + for name, param in new_gm.named_parameters(): + param.data.zero_() + + hash_val = cast(str, FxGraphCachePickler.get_hash(new_gm)) + + return hash_val + + @abstractmethod + def save( + self, + hash: str, + serialized_engine: bytes, + input_names: List[str], + output_names: List[str], + ) -> bool: + """Save the serialized engine to hard disk + + Args: + hash (str): hash value of the GraphModule + serialized_engine (bytes): serialized TRT engine + input_names (List[str]): input names of TRT engine + output_names (List[str]): output names of TRT engine + + Returns: + bool: whether the serialized engine is saved successfully + """ + pass + + @abstractmethod + def load(self, hash: str) -> Tuple[Optional[bytes], List[str], List[str]]: + """Load the serialized engine from hard disk + + Args: + hash (str): hash value of the GraphModule + + Returns: + Sequence[Optional[bytes], List[str], List[str]]: serialized TRT engine, input names of TRT Engine, output names of TRT Engine + """ + pass + + +class EngineCache(BaseEngineCache): + + def __init__( + self, + engine_cache_size: int, + engine_cache_dir: str, + ) -> None: + self.total_engine_cache_size = engine_cache_size + self.available_engine_cache_size = engine_cache_size + self.engine_cache_dir = engine_cache_dir + self.hash2size_map: Dict[str, int] = {} + + def has_available_cache_size(self, serialized_engine: bytes) -> bool: + """Check if the cache has available space for saving the serialized engine + + Args: + serialized_engine (bytes): serialized TRT engine + + Returns: + bool: whether the cache has available size for the serialized engine + """ + return int(serialized_engine.nbytes) <= self.available_engine_cache_size + + def clear_cache(self, needed_min_size: int) -> bool: + """Clear the cache to make sure at least `needed_min_size` bytes are available, if possible + + Args: + needed_min_size (int): the minimum needed size + + Returns: + bool: whether the cache is cleared successfully + """ + + def LRU() -> bool: + """Clear the Least Recently Used engine in the cache""" + # Get the list of engine directories + engines_hash_values = os.listdir(self.engine_cache_dir) + # Sort the engine directories by modification time (oldest first) + engines_hash_values.sort( + key=lambda x: os.path.getmtime(os.path.join(self.engine_cache_dir, 
x)) + ) + # Iterate over the engine directories and remove the oldest ones until enough space is available + for engine_hash in engines_hash_values: + if self.available_engine_cache_size >= needed_min_size: + break + engine_path = os.path.join(self.engine_cache_dir, engine_hash) + try: + # Remove the entire directory + shutil.rmtree(engine_path) + # Update the available cache size + self.available_engine_cache_size += self.hash2size_map.pop( + engine_hash, 0 + ) + _LOGGER.info( + f"Removed the engine cache at {engine_path}, available cache size: {self.available_engine_cache_size} bytes." + ) + except Exception as e: + _LOGGER.warning( + f"Failed to clear the engine cache at {engine_path}: {e}" + ) + return False + return True + + if not os.path.exists(self.engine_cache_dir): + return False + + _LOGGER.info( + f"Total cache size: {self.total_engine_cache_size} bytes; available cache size: {self.available_engine_cache_size} bytes. Clearing the cache to make sure at least {needed_min_size} bytes are available." + ) + return LRU() + + def save( + self, + hash: str, + serialized_engine: bytes, + input_names: List[str], + output_names: List[str], + ) -> bool: + serialized_engine_size = int(serialized_engine.nbytes) + if serialized_engine_size > self.total_engine_cache_size: + _LOGGER.warning( + f"The serialized engine cannot be saved because the size of the engine {serialized_engine_size} is larger than the total cache size {self.total_engine_cache_size}." + ) + return False + + # Check if there is enough available cache size for the serialized engine + if not self.has_available_cache_size(serialized_engine): + self.clear_cache(serialized_engine_size) + + # Save the serialized engine to the cache directory + if self.has_available_cache_size(serialized_engine): + path = os.path.join( + self.engine_cache_dir, + f"{hash}/engine--{input_names}--{output_names}.trt", + ) + try: + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "wb") as f: + f.write(serialized_engine) + self.hash2size_map[hash] = serialized_engine_size + self.available_engine_cache_size -= serialized_engine_size + _LOGGER.info(f"A TRT engine was cached to {path}") + + except Exception as e: + _LOGGER.warning(f"Failed to save the TRT engine to {path}: {e}") + return False + + return True + + else: + _LOGGER.warning( + f"The serialized engine {serialized_engine_size} is still larger than the available cache size {self.available_engine_cache_size}." + ) + return False + + def load(self, hash: str) -> Tuple[Optional[bytes], List[str], List[str]]: + directory = os.path.join(self.engine_cache_dir, hash) + if os.path.exists(directory): + engine_list = os.listdir(directory) + assert ( + len(engine_list) == 1 + ), f"There are more than one engine {engine_list} under {directory}." 
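+            # save() wrote the file as {hash}/engine--{input_names}--{output_names}.trt,
+            # so the input/output name lists can be recovered from the file name itself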
+ path = os.path.join(directory, engine_list[0]) + input_names_str, output_names_str = ( + engine_list[0].split(".trt")[0].split("--")[1:] + ) + input_names = ast.literal_eval(input_names_str) + output_names = ast.literal_eval(output_names_str) + with open(path, "rb") as f: + serialized_engine = f.read() + return serialized_engine, input_names, output_names + else: + return None, [], [] diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py index 4a9792d3dc..90c17d03c3 100644 --- a/py/torch_tensorrt/dynamo/_settings.py +++ b/py/torch_tensorrt/dynamo/_settings.py @@ -14,9 +14,13 @@ DRYRUN, ENABLE_EXPERIMENTAL_DECOMPOSITIONS, ENABLED_PRECISIONS, + ENGINE_CACHE_DIR, + ENGINE_CACHE_INSTANCE, + ENGINE_CACHE_SIZE, ENGINE_CAPABILITY, HARDWARE_COMPATIBLE, LAZY_ENGINE_INIT, + LOAD_ENGINE_CACHE, MAKE_REFITABLE, MAX_AUX_STREAMS, MIN_BLOCK_SIZE, @@ -24,6 +28,7 @@ OPTIMIZATION_LEVEL, PASS_THROUGH_BUILD_FAILURES, REQUIRE_FULL_COMPILATION, + SAVE_ENGINE_CACHE, SPARSE_WEIGHTS, TIMING_CACHE_PATH, TRUNCATE_DOUBLE, @@ -33,6 +38,7 @@ WORKSPACE_SIZE, default_device, ) +from torch_tensorrt.dynamo._engine_caching import BaseEngineCache @dataclass @@ -74,6 +80,11 @@ class CompilationSettings: output to a file if a string path is specified hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer) timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation + save_engine_cache (bool): Whether to save the compiled TRT engines to hard disk + load_engine_cache (bool): Whether to load the compiled TRT engines from hard disk + engine_cache_dir (str): Directory to store the cached TRT engines + engine_cache_size (int): Maximum hard-disk space to use for the engine cache + engine_cache_instance (BaseEngineCache): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache """ enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS) @@ -106,3 +117,8 @@ class CompilationSettings: hardware_compatible: bool = HARDWARE_COMPATIBLE timing_cache_path: str = TIMING_CACHE_PATH lazy_engine_init: bool = LAZY_ENGINE_INIT + save_engine_cache: bool = SAVE_ENGINE_CACHE + load_engine_cache: bool = LOAD_ENGINE_CACHE + engine_cache_dir: str = ENGINE_CACHE_DIR + engine_cache_size: int = ENGINE_CACHE_SIZE + engine_cache_instance: BaseEngineCache = ENGINE_CACHE_INSTANCE diff --git a/py/torch_tensorrt/dynamo/backend/backends.py b/py/torch_tensorrt/dynamo/backend/backends.py index ae3cb38f2d..e34f37a9b8 100644 --- a/py/torch_tensorrt/dynamo/backend/backends.py +++ b/py/torch_tensorrt/dynamo/backend/backends.py @@ -98,7 +98,7 @@ def _pretraced_backend( logger.debug("Post-AOT Autograd graph:\n" + str(gm.graph)) - gm = post_lowering(gm) + gm = post_lowering(gm, sample_inputs) logger.debug("Lowered Input graph:\n " + str(gm.graph)) diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 9fef61961b..ccb602e8dc 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -323,6 +323,7 @@ def _save_timing_cache( This is called after a TensorRT engine is built. 
Save the timing cache """ timing_cache = builder_config.get_timing_cache() + os.makedirs(os.path.dirname(timing_cache_path), exist_ok=True) with open(timing_cache_path, "wb") as timing_cache_file: timing_cache_file.write(memoryview(timing_cache.serialize())) @@ -516,15 +517,50 @@ def run( Args: strict_type_constraints: Usually we should set it to False unless we want to control the precision of certain layer for numeric reasons. algorithm_selector: set up algorithm selection for certain layer + tactic_sources: set up tactic sources for certain layer Return: TRTInterpreterResult """ + if ( + self.compilation_settings.save_engine_cache + or self.compilation_settings.load_engine_cache + ): + engine_cache = self.compilation_settings.engine_cache_instance + hash_val = engine_cache.get_hash(self.module) + + if self.compilation_settings.load_engine_cache: + # query the cached TRT engine + serialized_engine, input_names, output_names = engine_cache.load(hash_val) + if serialized_engine is not None: + self._input_names = input_names + self._output_names = output_names + _LOGGER.info( + "Hit the cached TRT engine. It is loaded for skipping recompilation." + ) + + # refit the engine + from torch_tensorrt.dynamo._refit import ( + _refit_single_trt_engine_with_gm, + ) + + runtime = trt.Runtime(TRT_LOGGER) + engine = runtime.deserialize_cuda_engine(serialized_engine) + _refit_single_trt_engine_with_gm( + self.module, engine, self.input_specs, self.compilation_settings + ) + _LOGGER.info("Refitting Succeed!") + + return TRTInterpreterResult( + serialized_engine, self._input_names, self._output_names + ) + self._construct_trt_network_def() if self.compilation_settings.make_refitable: self._save_weight_mapping() build_engine_start_time = datetime.now() + _LOGGER.info("Not found cached TRT engines. 
Start building engine.") builder_config = self._populate_trt_builder_config( strict_type_constraints, algorithm_selector, tactic_sources @@ -547,6 +583,10 @@ def run( self._save_timing_cache( builder_config, self.compilation_settings.timing_cache_path ) + if self.compilation_settings.save_engine_cache: + engine_cache.save( + hash_val, serialized_engine, self._input_names, self._output_names + ) with io.BytesIO() as engine_bytes: engine_bytes.write(serialized_engine) From f3d10848e29b4b185edaec61a6ff562f21a18f20 Mon Sep 17 00:00:00 2001 From: Evan Li Date: Wed, 7 Aug 2024 11:45:12 -0700 Subject: [PATCH 02/14] rebase --- py/torch_tensorrt/dynamo/backend/backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py/torch_tensorrt/dynamo/backend/backends.py b/py/torch_tensorrt/dynamo/backend/backends.py index e34f37a9b8..ae3cb38f2d 100644 --- a/py/torch_tensorrt/dynamo/backend/backends.py +++ b/py/torch_tensorrt/dynamo/backend/backends.py @@ -98,7 +98,7 @@ def _pretraced_backend( logger.debug("Post-AOT Autograd graph:\n" + str(gm.graph)) - gm = post_lowering(gm, sample_inputs) + gm = post_lowering(gm) logger.debug("Lowered Input graph:\n " + str(gm.graph)) From 1e5b501709a852e18eb0b114439d966576cd26a6 Mon Sep 17 00:00:00 2001 From: Evan Li Date: Thu, 8 Aug 2024 18:25:45 -0700 Subject: [PATCH 03/14] add comments --- examples/dynamo/engine_caching_example.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/dynamo/engine_caching_example.py b/examples/dynamo/engine_caching_example.py index a7b8f02f7a..1bfbb4dd44 100644 --- a/examples/dynamo/engine_caching_example.py +++ b/examples/dynamo/engine_caching_example.py @@ -41,6 +41,10 @@ def dynamo_path(iterations=3): model, args=example_inputs, dynamic_shapes={"x": {0: batch}} ) + # The 1st iteration is to measure the compilation time without engine caching + # The 2nd and 3rd iterations are to measure the compilation time with engine caching. + # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. + # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. for i in range(iterations): inputs = [torch.rand((100 + i, 3, 224, 224)).to("cuda")] remove_timing_cache() # remove timing cache for engine caching messurement @@ -133,6 +137,10 @@ def compile_path(iterations=3): start = torch.cuda.Event(enable_timing=True) end = torch.cuda.Event(enable_timing=True) + # The 1st iteration is to measure the compilation time without engine caching + # The 2nd and 3rd iterations are to measure the compilation time with engine caching. + # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. + # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. 
for i in range(iterations): inputs = [torch.rand(size).to("cuda")] # remove timing cache and reset dynamo for engine caching messurement From bc0a8c055953a8b7a04092491dbc81e10cd3b9ad Mon Sep 17 00:00:00 2001 From: Evan Li Date: Mon, 12 Aug 2024 14:08:55 -0700 Subject: [PATCH 04/14] add bert example --- .../dynamo/engine_caching_bert_example.py | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 examples/dynamo/engine_caching_bert_example.py diff --git a/examples/dynamo/engine_caching_bert_example.py b/examples/dynamo/engine_caching_bert_example.py new file mode 100644 index 0000000000..f4635f5f5d --- /dev/null +++ b/examples/dynamo/engine_caching_bert_example.py @@ -0,0 +1,64 @@ +import numpy as np +import torch +import torch_tensorrt +from engine_caching_example import remove_timing_cache +from transformers import BertModel + +np.random.seed(0) +torch.manual_seed(0) + +model = BertModel.from_pretrained("bert-base-uncased", return_dict=False).cuda().eval() +inputs = [ + torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"), + torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"), +] + + +def compile_bert(iterations=3): + times = [] + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + # The 1st iteration is to measure the compilation time without engine caching + # The 2nd and 3rd iterations are to measure the compilation time with engine caching. + # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. + # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. + for i in range(iterations): + # remove timing cache and reset dynamo for engine caching messurement + remove_timing_cache() + torch._dynamo.reset() + + if i == 0: + save_engine_cache = False + load_engine_cache = False + else: + save_engine_cache = True + load_engine_cache = True + + start.record() + compilation_kwargs = { + "use_python_runtime": False, + "enabled_precisions": {torch.float}, + "truncate_double": True, + "debug": True, + "min_block_size": 1, + "make_refitable": True, + "save_engine_cache": save_engine_cache, + "load_engine_cache": load_engine_cache, + "engine_cache_size": 1 << 30, # 1GB + } + optimized_model = torch.compile( + model, + backend="torch_tensorrt", + options=compilation_kwargs, + ) + optimized_model(*inputs) + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) + + print("-----compile bert-----> compilation time:", times, "milliseconds") + + +if __name__ == "__main__": + compile_bert() From 79544e39d9b1e5ffbcd747a046fc17c018a1ed5e Mon Sep 17 00:00:00 2001 From: Evan Li Date: Wed, 14 Aug 2024 02:43:14 -0700 Subject: [PATCH 05/14] support saving weight name map --- .../dynamo/engine_caching_bert_example.py | 4 +- examples/dynamo/engine_caching_example.py | 5 +- py/torch_tensorrt/dynamo/_engine_caching.py | 124 +++++++++++++----- .../dynamo/conversion/_TRTInterpreter.py | 29 ++-- 4 files changed, 107 insertions(+), 55 deletions(-) diff --git a/examples/dynamo/engine_caching_bert_example.py b/examples/dynamo/engine_caching_bert_example.py index f4635f5f5d..2f133f5e8f 100644 --- a/examples/dynamo/engine_caching_bert_example.py +++ b/examples/dynamo/engine_caching_bert_example.py @@ -40,7 +40,7 @@ def compile_bert(iterations=3): "use_python_runtime": False, "enabled_precisions": {torch.float}, "truncate_double": True, - "debug": True, + "debug": False, "min_block_size": 1, "make_refitable": True, "save_engine_cache": 
save_engine_cache, @@ -57,7 +57,7 @@ def compile_bert(iterations=3): torch.cuda.synchronize() times.append(start.elapsed_time(end)) - print("-----compile bert-----> compilation time:", times, "milliseconds") + print("-----compile bert-----> compilation time:\n", times, "milliseconds") if __name__ == "__main__": diff --git a/examples/dynamo/engine_caching_example.py b/examples/dynamo/engine_caching_example.py index 1bfbb4dd44..80cf696466 100644 --- a/examples/dynamo/engine_caching_example.py +++ b/examples/dynamo/engine_caching_example.py @@ -72,12 +72,11 @@ def dynamo_path(iterations=3): torch.cuda.synchronize() times.append(start.elapsed_time(end)) - print("-----dynamo_path-----> compilation time:", times, "milliseconds") + print("-----dynamo_path-----> compilation time:\n", times, "milliseconds") # Custom Engine Cache class MyEngineCache(BaseEngineCache): - def __init__( self, engine_cache_size: int, @@ -174,7 +173,7 @@ def compile_path(iterations=3): torch.cuda.synchronize() times.append(start.elapsed_time(end)) - print("-----compile_path-----> compilation time:", times, "milliseconds") + print("-----compile_path-----> compilation time:\n", times, "milliseconds") if __name__ == "__main__": diff --git a/py/torch_tensorrt/dynamo/_engine_caching.py b/py/torch_tensorrt/dynamo/_engine_caching.py index f491bc2523..f9b6f075eb 100644 --- a/py/torch_tensorrt/dynamo/_engine_caching.py +++ b/py/torch_tensorrt/dynamo/_engine_caching.py @@ -1,8 +1,9 @@ -import ast import copy import logging import os +import pickle import shutil +import sys from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Tuple, cast @@ -50,6 +51,7 @@ def save( serialized_engine: bytes, input_names: List[str], output_names: List[str], + weight_name_map: Optional[Dict[str, Any]] = None, ) -> bool: """Save the serialized engine to hard disk @@ -58,6 +60,7 @@ def save( serialized_engine (bytes): serialized TRT engine input_names (List[str]): input names of TRT engine output_names (List[str]): output names of TRT engine + weight_name_map (Optional[Dict[str, Any]]): weight name map for refitting Returns: bool: whether the serialized engine is saved successfully @@ -65,14 +68,16 @@ def save( pass @abstractmethod - def load(self, hash: str) -> Tuple[Optional[bytes], List[str], List[str]]: + def load( + self, hash: str + ) -> Tuple[Optional[bytes], List[str], List[str], Optional[Dict[str, Any]]]: """Load the serialized engine from hard disk Args: hash (str): hash value of the GraphModule Returns: - Sequence[Optional[bytes], List[str], List[str]]: serialized TRT engine, input names of TRT Engine, output names of TRT Engine + Sequence[Optional[bytes], List[str], List[str], Optional[Dict[str, Any]]]: serialized engine, input names, output names, weight name map """ pass @@ -89,16 +94,16 @@ def __init__( self.engine_cache_dir = engine_cache_dir self.hash2size_map: Dict[str, int] = {} - def has_available_cache_size(self, serialized_engine: bytes) -> bool: + def has_available_cache_size(self, needed_size: int) -> bool: """Check if the cache has available space for saving the serialized engine Args: - serialized_engine (bytes): serialized TRT engine + needed_size (int): needed size for erialized TRT engine and/or weight_name_map Returns: bool: whether the cache has available size for the serialized engine """ - return int(serialized_engine.nbytes) <= self.available_engine_cache_size + return needed_size <= self.available_engine_cache_size def clear_cache(self, needed_min_size: int) -> bool: """Clear the cache to 
make sure at least `needed_min_size` bytes are available, if possible @@ -154,36 +159,75 @@ def save( serialized_engine: bytes, input_names: List[str], output_names: List[str], + weight_name_map: Optional[Dict[str, Any]] = None, ) -> bool: serialized_engine_size = int(serialized_engine.nbytes) + if weight_name_map is not None: + serialized_engine_size += sum( + sys.getsizeof(v) for v in weight_name_map.values() + ) if serialized_engine_size > self.total_engine_cache_size: _LOGGER.warning( f"The serialized engine cannot be saved because the size of the engine {serialized_engine_size} is larger than the total cache size {self.total_engine_cache_size}." ) return False - # Check if there is enough available cache size for the serialized engine - if not self.has_available_cache_size(serialized_engine): + # Check if there is enough available cache size for the serialized engine and/or weight_name_map + if not self.has_available_cache_size(serialized_engine_size): self.clear_cache(serialized_engine_size) # Save the serialized engine to the cache directory - if self.has_available_cache_size(serialized_engine): - path = os.path.join( - self.engine_cache_dir, - f"{hash}/engine--{input_names}--{output_names}.trt", + if self.has_available_cache_size(serialized_engine_size): + self.hash2size_map[hash] = serialized_engine_size + self.available_engine_cache_size -= serialized_engine_size + directory = os.path.join(self.engine_cache_dir, hash) + + engine_path = os.path.join( + directory, + "engine.trt", + ) + io_names_path = os.path.join( + directory, + "io_names.pkl", ) try: - os.makedirs(os.path.dirname(path), exist_ok=True) - with open(path, "wb") as f: + os.makedirs(os.path.dirname(engine_path), exist_ok=True) + with open(engine_path, "wb") as f: f.write(serialized_engine) - self.hash2size_map[hash] = serialized_engine_size - self.available_engine_cache_size -= serialized_engine_size - _LOGGER.info(f"A TRT engine was cached to {path}") - + os.makedirs(os.path.dirname(io_names_path), exist_ok=True) + with open(io_names_path, "wb") as f: + pickle.dump( + {"input_names": input_names, "output_names": output_names}, f + ) + _LOGGER.info(f"The TRT engine was saved to {engine_path}") except Exception as e: - _LOGGER.warning(f"Failed to save the TRT engine to {path}: {e}") + del self.hash2size_map[hash] + self.available_engine_cache_size += serialized_engine_size + shutil.rmtree(directory) + _LOGGER.warning(f"Failed to save the TRT engine to {engine_path}: {e}") return False + if weight_name_map is not None: + weight_name_map_path = os.path.join( + directory, + "weight_name_map.pkl", + ) + try: + os.makedirs(os.path.dirname(weight_name_map_path), exist_ok=True) + with open(weight_name_map_path, "wb") as f: + pickle.dump(weight_name_map, f) + _LOGGER.info( + f"The weight_name_map was saved to {weight_name_map_path}" + ) + except Exception as e: + del self.hash2size_map[hash] + self.available_engine_cache_size += serialized_engine_size + shutil.rmtree(directory) + _LOGGER.warning( + f"Failed to save the weight_name_map to {weight_name_map_path}: {e}" + ) + return False + return True else: @@ -192,21 +236,33 @@ def save( ) return False - def load(self, hash: str) -> Tuple[Optional[bytes], List[str], List[str]]: + def load( + self, hash: str + ) -> Tuple[Optional[bytes], List[str], List[str], Optional[Dict[str, Any]]]: directory = os.path.join(self.engine_cache_dir, hash) if os.path.exists(directory): - engine_list = os.listdir(directory) - assert ( - len(engine_list) == 1 - ), f"There are more than one engine 
{engine_list} under {directory}." - path = os.path.join(directory, engine_list[0]) - input_names_str, output_names_str = ( - engine_list[0].split(".trt")[0].split("--")[1:] - ) - input_names = ast.literal_eval(input_names_str) - output_names = ast.literal_eval(output_names_str) - with open(path, "rb") as f: - serialized_engine = f.read() - return serialized_engine, input_names, output_names + # load engine + serialized_engine = None + engine_path = os.path.join(directory, "engine.trt") + if os.path.exists(engine_path): + with open(engine_path, "rb") as f: + serialized_engine = f.read() + + input_names = [] + output_names = [] + io_names_path = os.path.join(directory, "io_names.pkl") + if os.path.exists(io_names_path): + with open(io_names_path, "rb") as f: + io_names = pickle.load(f) + input_names = io_names["input_names"] + output_names = io_names["output_names"] + + # load weight_name_map + weight_name_map = None + weight_name_map_path = os.path.join(directory, "weight_name_map.pkl") + if os.path.exists(weight_name_map_path): + with open(weight_name_map_path, "rb") as f: + weight_name_map = pickle.load(f) + return serialized_engine, input_names, output_names, weight_name_map else: - return None, [], [] + return None, [], [], {} diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index ccb602e8dc..cffcea5b9f 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -530,28 +530,21 @@ def run( if self.compilation_settings.load_engine_cache: # query the cached TRT engine - serialized_engine, input_names, output_names = engine_cache.load(hash_val) + serialized_engine, input_names, output_names, weight_name_map = ( + engine_cache.load(hash_val) + ) if serialized_engine is not None: self._input_names = input_names self._output_names = output_names + self.weight_name_map = weight_name_map _LOGGER.info( "Hit the cached TRT engine. It is loaded for skipping recompilation." 
) - - # refit the engine - from torch_tensorrt.dynamo._refit import ( - _refit_single_trt_engine_with_gm, - ) - - runtime = trt.Runtime(TRT_LOGGER) - engine = runtime.deserialize_cuda_engine(serialized_engine) - _refit_single_trt_engine_with_gm( - self.module, engine, self.input_specs, self.compilation_settings - ) - _LOGGER.info("Refitting Succeed!") - return TRTInterpreterResult( - serialized_engine, self._input_names, self._output_names + serialized_engine, + self._input_names, + self._output_names, + self.weight_name_map, ) self._construct_trt_network_def() @@ -585,7 +578,11 @@ def run( ) if self.compilation_settings.save_engine_cache: engine_cache.save( - hash_val, serialized_engine, self._input_names, self._output_names + hash_val, + serialized_engine, + self._input_names, + self._output_names, + self.weight_name_map, ) with io.BytesIO() as engine_bytes: From a7b7676c3e6ba35c8c10c6800db601b2767285ff Mon Sep 17 00:00:00 2001 From: Evan Li Date: Tue, 20 Aug 2024 22:12:01 -0700 Subject: [PATCH 06/14] refactor --- .../dynamo/engine_caching_bert_example.py | 13 +- examples/dynamo/engine_caching_example.py | 89 +++---- py/torch_tensorrt/dynamo/_compiler.py | 47 +--- py/torch_tensorrt/dynamo/_defaults.py | 9 +- py/torch_tensorrt/dynamo/_engine_caching.py | 245 ++++++++---------- py/torch_tensorrt/dynamo/_settings.py | 24 +- .../dynamo/conversion/_TRTInterpreter.py | 62 +++-- py/torch_tensorrt/dynamo/utils.py | 16 ++ 8 files changed, 227 insertions(+), 278 deletions(-) diff --git a/examples/dynamo/engine_caching_bert_example.py b/examples/dynamo/engine_caching_bert_example.py index 2f133f5e8f..43cfc5f15a 100644 --- a/examples/dynamo/engine_caching_bert_example.py +++ b/examples/dynamo/engine_caching_bert_example.py @@ -29,11 +29,11 @@ def compile_bert(iterations=3): torch._dynamo.reset() if i == 0: - save_engine_cache = False - load_engine_cache = False + cache_built_engines = False + reuse_cached_engines = False else: - save_engine_cache = True - load_engine_cache = True + cache_built_engines = True + reuse_cached_engines = True start.record() compilation_kwargs = { @@ -43,8 +43,9 @@ def compile_bert(iterations=3): "debug": False, "min_block_size": 1, "make_refitable": True, - "save_engine_cache": save_engine_cache, - "load_engine_cache": load_engine_cache, + "cache_built_engines": cache_built_engines, + "reuse_cached_engines": reuse_cached_engines, + "engine_cache_dir": "/tmp/torch_trt_bert_engine_cache", "engine_cache_size": 1 << 30, # 1GB } optimized_model = torch.compile( diff --git a/examples/dynamo/engine_caching_example.py b/examples/dynamo/engine_caching_example.py index 80cf696466..89912e74b0 100644 --- a/examples/dynamo/engine_caching_example.py +++ b/examples/dynamo/engine_caching_example.py @@ -1,7 +1,5 @@ -import ast -import logging import os -from typing import List, Optional, Tuple +from typing import Optional import numpy as np import torch @@ -10,9 +8,6 @@ from torch_tensorrt.dynamo._defaults import TIMING_CACHE_PATH from torch_tensorrt.dynamo._engine_caching import BaseEngineCache -_LOGGER: logging.Logger = logging.getLogger(__name__) - - np.random.seed(0) torch.manual_seed(0) size = (100, 3, 224, 224) @@ -49,11 +44,11 @@ def dynamo_path(iterations=3): inputs = [torch.rand((100 + i, 3, 224, 224)).to("cuda")] remove_timing_cache() # remove timing cache for engine caching messurement if i == 0: - save_engine_cache = False - load_engine_cache = False + cache_built_engines = False + reuse_cached_engines = False else: - save_engine_cache = True - load_engine_cache = True + 
cache_built_engines = True + reuse_cached_engines = True start.record() trt_gm = torch_trt.dynamo.compile( @@ -64,8 +59,8 @@ def dynamo_path(iterations=3): debug=debug, min_block_size=min_block_size, make_refitable=True, - save_engine_cache=save_engine_cache, - load_engine_cache=load_engine_cache, + cache_built_engines=cache_built_engines, + reuse_cached_engines=reuse_cached_engines, engine_cache_size=1 << 30, # 1GB ) end.record() @@ -79,60 +74,36 @@ def dynamo_path(iterations=3): class MyEngineCache(BaseEngineCache): def __init__( self, - engine_cache_size: int, engine_cache_dir: str, ) -> None: - self.total_engine_cache_size = engine_cache_size - self.available_engine_cache_size = engine_cache_size self.engine_cache_dir = engine_cache_dir def save( self, hash: str, - serialized_engine: bytes, - input_names: List[str], - output_names: List[str], - ) -> bool: + blob: bytes, + prefix: str = "blob", + ): path = os.path.join( self.engine_cache_dir, - f"{hash}/engine--{input_names}--{output_names}.trt", + f"{prefix}_{hash}.bin", ) - try: - os.makedirs(os.path.dirname(path), exist_ok=True) - with open(path, "wb") as f: - f.write(serialized_engine) - except Exception as e: - _LOGGER.warning(f"Failed to save the TRT engine to {path}: {e}") - return False - - _LOGGER.info(f"A TRT engine was cached to {path}") - serialized_engine_size = int(serialized_engine.nbytes) - self.available_engine_cache_size -= serialized_engine_size - return True - - def load(self, hash: str) -> Tuple[Optional[bytes], List[str], List[str]]: - directory = os.path.join(self.engine_cache_dir, hash) - if os.path.exists(directory): - engine_list = os.listdir(directory) - assert ( - len(engine_list) == 1 - ), f"There are more than one engine {engine_list} under {directory}." - path = os.path.join(directory, engine_list[0]) - input_names_str, output_names_str = ( - engine_list[0].split(".trt")[0].split("--")[1:] - ) - input_names = ast.literal_eval(input_names_str) - output_names = ast.literal_eval(output_names_str) + os.makedirs(path, exist_ok=True) + with open(path, "wb") as f: + f.write(blob) + + def load(self, hash: str, prefix: str = "blob") -> Optional[bytes]: + path = os.path.join(self.engine_cache_dir, f"{prefix}_{hash}.bin") + if os.path.exists(path): with open(path, "rb") as f: - serialized_engine = f.read() - return serialized_engine, input_names, output_names - else: - return None, [], [] + blob = f.read() + return blob + return None def compile_path(iterations=3): times = [] - engine_cache = MyEngineCache(200 * (1 << 20), "/tmp/your_dir") + engine_cache = MyEngineCache("/tmp/your_dir") start = torch.cuda.Event(enable_timing=True) end = torch.cuda.Event(enable_timing=True) @@ -147,11 +118,11 @@ def compile_path(iterations=3): torch._dynamo.reset() if i == 0: - save_engine_cache = False - load_engine_cache = False + cache_built_engines = False + reuse_cached_engines = False else: - save_engine_cache = True - load_engine_cache = True + cache_built_engines = True + reuse_cached_engines = True start.record() compiled_model = torch.compile( @@ -163,9 +134,9 @@ def compile_path(iterations=3): "debug": debug, "min_block_size": min_block_size, "make_refitable": True, - "save_engine_cache": save_engine_cache, - "load_engine_cache": load_engine_cache, - "engine_cache_instance": engine_cache, # use custom engine cache + "cache_built_engines": cache_built_engines, + "reuse_cached_engines": reuse_cached_engines, + "custom_engine_cache": engine_cache, # use custom engine cache }, ) compiled_model(*inputs) # trigger the compilation 
@@ -178,4 +149,4 @@ def compile_path(iterations=3): if __name__ == "__main__": dynamo_path() - compile_path() + # compile_path() diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 229ecb5ef7..bc31592c06 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -18,7 +18,7 @@ dryrun_stats_display, parse_non_trt_nodes, ) -from torch_tensorrt.dynamo._engine_caching import BaseEngineCache, EngineCache +from torch_tensorrt.dynamo._engine_caching import BaseEngineCache, DiskEngineCache from torch_tensorrt.dynamo.conversion import ( CompilationSettings, UnsupportedOperatorException, @@ -83,11 +83,11 @@ def compile( hardware_compatible: bool = _defaults.HARDWARE_COMPATIBLE, timing_cache_path: str = _defaults.TIMING_CACHE_PATH, lazy_engine_init: bool = _defaults.LAZY_ENGINE_INIT, - save_engine_cache: bool = _defaults.SAVE_ENGINE_CACHE, - load_engine_cache: bool = _defaults.LOAD_ENGINE_CACHE, + cache_built_engines: bool = _defaults.CACHE_BUILT_ENGINES, + reuse_cached_engines: bool = _defaults.REUSE_CACHED_ENGINES, engine_cache_dir: str = _defaults.ENGINE_CACHE_DIR, engine_cache_size: int = _defaults.ENGINE_CACHE_SIZE, - engine_cache_instance: Optional[BaseEngineCache] = None, + custom_engine_cache: Optional[BaseEngineCache] = _defaults.CUSTOM_ENGINE_CACHE, **kwargs: Any, ) -> torch.fx.GraphModule: """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT @@ -153,11 +153,11 @@ def compile( hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer) timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation lazy_engine_init (bool): Defer setting up engines until the compilation of all engines is complete. Can allow larger models with multiple graph breaks to compile but can lead to oversubscription of GPU memory at runtime. - save_engine_cache (bool): Whether to save the compiled TRT engines to hard disk - load_engine_cache (bool): Whether to load the compiled TRT engines from hard disk + cache_built_engines (bool): Whether to save the compiled TRT engines to storage + reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage engine_cache_dir (str): Directory to store the cached TRT engines engine_cache_size (int): Maximum hard-disk space to use for the engine cache - engine_cache_instance (Optional[BaseEngineCache]): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache + custom_engine_cache (Optional[BaseEngineCache]): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache. If used, engine_cache_dir and engine_cache_size will be ignored. 
**kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT @@ -235,10 +235,9 @@ def compile( gm = post_lowering(gm) logger.debug("Lowered Input graph: " + str(gm.graph)) - if engine_cache_instance is None: - engine_cache_instance = EngineCacheInstanceCreator.get_creator( - engine_cache_size, engine_cache_dir - ).engine_cache_instance + if cache_built_engines or reuse_cached_engines: + if custom_engine_cache is None: + custom_engine_cache = DiskEngineCache(engine_cache_dir, engine_cache_size) compilation_options = { "enabled_precisions": ( @@ -273,11 +272,9 @@ def compile( "hardware_compatible": hardware_compatible, "timing_cache_path": timing_cache_path, "lazy_engine_init": lazy_engine_init, - "save_engine_cache": save_engine_cache, - "load_engine_cache": load_engine_cache, - "engine_cache_dir": engine_cache_dir, - "engine_cache_size": engine_cache_size, - "engine_cache_instance": engine_cache_instance, + "cache_built_engines": cache_built_engines, + "reuse_cached_engines": reuse_cached_engines, + "custom_engine_cache": custom_engine_cache, } settings = CompilationSettings(**compilation_options) @@ -686,21 +683,3 @@ def convert_exported_program_to_serialized_trt_engine( serialized_engine: bytes = interpreter_result.serialized_engine return serialized_engine - - -class EngineCacheInstanceCreator: - engine_cache_creator = None - - def __init__(self, engine_cache_size: int, engine_cache_dir: str) -> None: - self.engine_cache_instance = EngineCache( - engine_cache_size=engine_cache_size, - engine_cache_dir=engine_cache_dir, - ) - - @classmethod - def get_creator( - cls, engine_cache_size: int, engine_cache_dir: str - ) -> EngineCacheInstanceCreator: - if cls.engine_cache_creator is None: - cls.engine_cache_creator = cls(engine_cache_size, engine_cache_dir) - return cls.engine_cache_creator diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py index e90f3f8c2a..83e85cb3c7 100644 --- a/py/torch_tensorrt/dynamo/_defaults.py +++ b/py/torch_tensorrt/dynamo/_defaults.py @@ -4,7 +4,6 @@ import torch from torch_tensorrt._Device import Device from torch_tensorrt._enums import EngineCapability, dtype -from torch_tensorrt.dynamo._engine_caching import EngineCache ENABLED_PRECISIONS = {dtype.f32} DEBUG = False @@ -36,13 +35,11 @@ tempfile.gettempdir(), "torch_tensorrt_engine_cache", "timing_cache.bin" ) LAZY_ENGINE_INIT = False -SAVE_ENGINE_CACHE = True -LOAD_ENGINE_CACHE = True +CACHE_BUILT_ENGINES = True +REUSE_CACHED_ENGINES = True ENGINE_CACHE_DIR = os.path.join(tempfile.gettempdir(), "torch_tensorrt_engine_cache") ENGINE_CACHE_SIZE = 1073741824 -ENGINE_CACHE_INSTANCE = EngineCache( - engine_cache_size=ENGINE_CACHE_SIZE, engine_cache_dir=ENGINE_CACHE_DIR -) +CUSTOM_ENGINE_CACHE = None def default_device() -> Device: diff --git a/py/torch_tensorrt/dynamo/_engine_caching.py b/py/torch_tensorrt/dynamo/_engine_caching.py index f9b6f075eb..01220233ea 100644 --- a/py/torch_tensorrt/dynamo/_engine_caching.py +++ b/py/torch_tensorrt/dynamo/_engine_caching.py @@ -3,7 +3,6 @@ import os import pickle import shutil -import sys from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Tuple, cast @@ -44,78 +43,126 @@ def get_hash(gm: torch.fx.GraphModule) -> str: return hash_val - @abstractmethod - def save( - self, - hash: str, + @staticmethod + def pack( serialized_engine: bytes, input_names: List[str], output_names: List[str], - weight_name_map: Optional[Dict[str, Any]] = None, - ) -> bool: - """Save the 
serialized engine to hard disk + weight_name_map: Optional[Dict[str, Any]], + ) -> bytes: + """Pack serialized engine, input names, output names, and weight map into a single blob Args: - hash (str): hash value of the GraphModule serialized_engine (bytes): serialized TRT engine input_names (List[str]): input names of TRT engine output_names (List[str]): output names of TRT engine weight_name_map (Optional[Dict[str, Any]]): weight name map for refitting Returns: - bool: whether the serialized engine is saved successfully + bytes: packed blob + """ + return pickle.dumps( + { + "serialized_engine": bytes(serialized_engine), + "input_names": input_names, + "output_names": output_names, + "weight_name_map": weight_name_map, + } + ) + + @staticmethod + def unpack( + packed_obj: bytes, + ) -> Tuple[bytes, List[str], List[str], Optional[Dict[str, Any]]]: + """Unpack packed blob into serialized engine, input names, output names, and weight map + + Args: + packed_obj (bytes): packed blob + + Returns: + Tuple[bytes, List[str], List[str], Optional[Dict[str, Any]]]: serialized engine, input names, output names, weight name map + """ + unpacked = pickle.loads(packed_obj) + return ( + unpacked["serialized_engine"], + unpacked["input_names"], + unpacked["output_names"], + unpacked["weight_name_map"], + ) + + @abstractmethod + def save(self, hash: str, blob: bytes, *args: Any, **kwargs: Any) -> None: + """Store blob in cache + + Args: + hash (str): hash value of the GraphModule + blob (bytes): packed blob """ pass @abstractmethod - def load( - self, hash: str - ) -> Tuple[Optional[bytes], List[str], List[str], Optional[Dict[str, Any]]]: - """Load the serialized engine from hard disk + def load(self, hash: str, *args: Any, **kwargs: Any) -> Optional[bytes]: + """Load blob from storage Args: hash (str): hash value of the GraphModule Returns: - Sequence[Optional[bytes], List[str], List[str], Optional[Dict[str, Any]]]: serialized engine, input names, output names, weight name map + Optional[bytes]: blob or None if doesn't hit """ pass -class EngineCache(BaseEngineCache): +class DiskEngineCache(BaseEngineCache): + dir2hash2size_map: Dict[str, Dict[str, int]] = ( + {} + ) # dir2hash2size_map["engine_cache_dir"]["hash"] = size def __init__( self, - engine_cache_size: int, engine_cache_dir: str, + engine_cache_size: int, ) -> None: - self.total_engine_cache_size = engine_cache_size - self.available_engine_cache_size = engine_cache_size + + def get_dir_size(path: str) -> int: + total = 0 + with os.scandir(path) as it: + for entry in it: + if entry.is_file(): + total += entry.stat().st_size + elif entry.is_dir(): + total += get_dir_size(entry.path) + return total + + if not os.path.exists(engine_cache_dir): + os.makedirs(engine_cache_dir, exist_ok=True) self.engine_cache_dir = engine_cache_dir - self.hash2size_map: Dict[str, int] = {} + self.total_engine_cache_size = engine_cache_size + self.available_engine_cache_size = engine_cache_size - get_dir_size( + engine_cache_dir + ) + if engine_cache_dir not in DiskEngineCache.dir2hash2size_map: + DiskEngineCache.dir2hash2size_map[engine_cache_dir] = {} def has_available_cache_size(self, needed_size: int) -> bool: - """Check if the cache has available space for saving the serialized engine + """Check if the cache has available space for saving object Args: - needed_size (int): needed size for erialized TRT engine and/or weight_name_map + needed_size (int): needed size for saving object Returns: - bool: whether the cache has available size for the serialized engine + bool: 
whether the cache has available size for saving object """ return needed_size <= self.available_engine_cache_size - def clear_cache(self, needed_min_size: int) -> bool: + def clear_cache(self, needed_min_size: int) -> None: """Clear the cache to make sure at least `needed_min_size` bytes are available, if possible Args: needed_min_size (int): the minimum needed size - - Returns: - bool: whether the cache is cleared successfully """ - def LRU() -> bool: + def LRU() -> None: """Clear the Least Recently Used engine in the cache""" # Get the list of engine directories engines_hash_values = os.listdir(self.engine_cache_dir) @@ -132,8 +179,10 @@ def LRU() -> bool: # Remove the entire directory shutil.rmtree(engine_path) # Update the available cache size - self.available_engine_cache_size += self.hash2size_map.pop( - engine_hash, 0 + self.available_engine_cache_size += ( + DiskEngineCache.dir2hash2size_map[self.engine_cache_dir].pop( + engine_hash, 0 + ) ) _LOGGER.info( f"Removed the engine cache at {engine_path}, available cache size: {self.available_engine_cache_size} bytes." @@ -142,127 +191,61 @@ def LRU() -> bool: _LOGGER.warning( f"Failed to clear the engine cache at {engine_path}: {e}" ) - return False - return True - if not os.path.exists(self.engine_cache_dir): - return False - - _LOGGER.info( - f"Total cache size: {self.total_engine_cache_size} bytes; available cache size: {self.available_engine_cache_size} bytes. Clearing the cache to make sure at least {needed_min_size} bytes are available." - ) - return LRU() + if needed_min_size > self.total_engine_cache_size: + _LOGGER.warning( + f"The needed minimum size {needed_min_size} is larger than the total cache size {self.total_engine_cache_size}. Nothing will be cleared." + ) + else: + LRU() def save( self, hash: str, - serialized_engine: bytes, - input_names: List[str], - output_names: List[str], - weight_name_map: Optional[Dict[str, Any]] = None, - ) -> bool: - serialized_engine_size = int(serialized_engine.nbytes) - if weight_name_map is not None: - serialized_engine_size += sum( - sys.getsizeof(v) for v in weight_name_map.values() - ) - if serialized_engine_size > self.total_engine_cache_size: + blob: bytes, + ) -> None: + blob_size = len(blob) + if blob_size > self.total_engine_cache_size: _LOGGER.warning( - f"The serialized engine cannot be saved because the size of the engine {serialized_engine_size} is larger than the total cache size {self.total_engine_cache_size}." + f"The serialized engine cannot be saved because the size {blob_size} is larger than the total cache size {self.total_engine_cache_size}." 
) - return False + return - # Check if there is enough available cache size for the serialized engine and/or weight_name_map - if not self.has_available_cache_size(serialized_engine_size): - self.clear_cache(serialized_engine_size) + if not self.has_available_cache_size(blob_size): + self.clear_cache(blob_size) - # Save the serialized engine to the cache directory - if self.has_available_cache_size(serialized_engine_size): - self.hash2size_map[hash] = serialized_engine_size - self.available_engine_cache_size -= serialized_engine_size + if self.has_available_cache_size(blob_size): + DiskEngineCache.dir2hash2size_map[self.engine_cache_dir][hash] = blob_size + self.available_engine_cache_size -= blob_size directory = os.path.join(self.engine_cache_dir, hash) + if not os.path.exists(directory): + os.makedirs(directory, exist_ok=True) - engine_path = os.path.join( - directory, - "engine.trt", - ) - io_names_path = os.path.join( + blob_path = os.path.join( directory, - "io_names.pkl", + "blob.bin", ) try: - os.makedirs(os.path.dirname(engine_path), exist_ok=True) - with open(engine_path, "wb") as f: - f.write(serialized_engine) - os.makedirs(os.path.dirname(io_names_path), exist_ok=True) - with open(io_names_path, "wb") as f: - pickle.dump( - {"input_names": input_names, "output_names": output_names}, f - ) - _LOGGER.info(f"The TRT engine was saved to {engine_path}") + with open(blob_path, "wb") as f: + f.write(blob) + _LOGGER.info(f"The blob was saved to {blob_path}") except Exception as e: - del self.hash2size_map[hash] - self.available_engine_cache_size += serialized_engine_size + del DiskEngineCache.dir2hash2size_map[self.engine_cache_dir][hash] + self.available_engine_cache_size += blob_size shutil.rmtree(directory) - _LOGGER.warning(f"Failed to save the TRT engine to {engine_path}: {e}") - return False - - if weight_name_map is not None: - weight_name_map_path = os.path.join( - directory, - "weight_name_map.pkl", - ) - try: - os.makedirs(os.path.dirname(weight_name_map_path), exist_ok=True) - with open(weight_name_map_path, "wb") as f: - pickle.dump(weight_name_map, f) - _LOGGER.info( - f"The weight_name_map was saved to {weight_name_map_path}" - ) - except Exception as e: - del self.hash2size_map[hash] - self.available_engine_cache_size += serialized_engine_size - shutil.rmtree(directory) - _LOGGER.warning( - f"Failed to save the weight_name_map to {weight_name_map_path}: {e}" - ) - return False - - return True + _LOGGER.warning(f"Failed to save the blob to {blob_path}: {e}") else: _LOGGER.warning( - f"The serialized engine {serialized_engine_size} is still larger than the available cache size {self.available_engine_cache_size}." + f"The size {blob_size} is still larger than the available cache size {self.available_engine_cache_size}." 
) - return False - def load( - self, hash: str - ) -> Tuple[Optional[bytes], List[str], List[str], Optional[Dict[str, Any]]]: + def load(self, hash: str) -> Optional[bytes]: directory = os.path.join(self.engine_cache_dir, hash) if os.path.exists(directory): - # load engine - serialized_engine = None - engine_path = os.path.join(directory, "engine.trt") - if os.path.exists(engine_path): - with open(engine_path, "rb") as f: - serialized_engine = f.read() - - input_names = [] - output_names = [] - io_names_path = os.path.join(directory, "io_names.pkl") - if os.path.exists(io_names_path): - with open(io_names_path, "rb") as f: - io_names = pickle.load(f) - input_names = io_names["input_names"] - output_names = io_names["output_names"] - - # load weight_name_map - weight_name_map = None - weight_name_map_path = os.path.join(directory, "weight_name_map.pkl") - if os.path.exists(weight_name_map_path): - with open(weight_name_map_path, "rb") as f: - weight_name_map = pickle.load(f) - return serialized_engine, input_names, output_names, weight_name_map - else: - return None, [], [], {} + blob_path = os.path.join(directory, "blob.bin") + if os.path.exists(blob_path): + with open(blob_path, "rb") as f: + blob = f.read() + return blob + return None diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py index 90c17d03c3..0327727c9f 100644 --- a/py/torch_tensorrt/dynamo/_settings.py +++ b/py/torch_tensorrt/dynamo/_settings.py @@ -6,6 +6,8 @@ from torch_tensorrt._enums import EngineCapability, dtype from torch_tensorrt.dynamo._defaults import ( ASSUME_DYNAMIC_SHAPE_SUPPORT, + CACHE_BUILT_ENGINES, + CUSTOM_ENGINE_CACHE, DEBUG, DISABLE_TF32, DLA_GLOBAL_DRAM_SIZE, @@ -14,13 +16,9 @@ DRYRUN, ENABLE_EXPERIMENTAL_DECOMPOSITIONS, ENABLED_PRECISIONS, - ENGINE_CACHE_DIR, - ENGINE_CACHE_INSTANCE, - ENGINE_CACHE_SIZE, ENGINE_CAPABILITY, HARDWARE_COMPATIBLE, LAZY_ENGINE_INIT, - LOAD_ENGINE_CACHE, MAKE_REFITABLE, MAX_AUX_STREAMS, MIN_BLOCK_SIZE, @@ -28,7 +26,7 @@ OPTIMIZATION_LEVEL, PASS_THROUGH_BUILD_FAILURES, REQUIRE_FULL_COMPILATION, - SAVE_ENGINE_CACHE, + REUSE_CACHED_ENGINES, SPARSE_WEIGHTS, TIMING_CACHE_PATH, TRUNCATE_DOUBLE, @@ -80,11 +78,9 @@ class CompilationSettings: output to a file if a string path is specified hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer) timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation - save_engine_cache (bool): Whether to save the compiled TRT engines to hard disk - load_engine_cache (bool): Whether to load the compiled TRT engines from hard disk - engine_cache_dir (str): Directory to store the cached TRT engines - engine_cache_size (int): Maximum hard-disk space to use for the engine cache - engine_cache_instance (BaseEngineCache): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache + cache_built_engines (bool): Whether to save the compiled TRT engines to storage + reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage + custom_engine_cache (Optional[BaseEngineCache]): Engine cache instance to use for saving and loading engines. 
Users can provide their own engine cache by inheriting from BaseEngineCache """ enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS) @@ -117,8 +113,6 @@ class CompilationSettings: hardware_compatible: bool = HARDWARE_COMPATIBLE timing_cache_path: str = TIMING_CACHE_PATH lazy_engine_init: bool = LAZY_ENGINE_INIT - save_engine_cache: bool = SAVE_ENGINE_CACHE - load_engine_cache: bool = LOAD_ENGINE_CACHE - engine_cache_dir: str = ENGINE_CACHE_DIR - engine_cache_size: int = ENGINE_CACHE_SIZE - engine_cache_instance: BaseEngineCache = ENGINE_CACHE_INSTANCE + cache_built_engines: bool = CACHE_BUILT_ENGINES + reuse_cached_engines: bool = REUSE_CACHED_ENGINES + custom_engine_cache: Optional[BaseEngineCache] = CUSTOM_ENGINE_CACHE diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index cffcea5b9f..a422f7dde4 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -522,30 +522,35 @@ def run( TRTInterpreterResult """ if ( - self.compilation_settings.save_engine_cache - or self.compilation_settings.load_engine_cache - ): - engine_cache = self.compilation_settings.engine_cache_instance - hash_val = engine_cache.get_hash(self.module) - - if self.compilation_settings.load_engine_cache: - # query the cached TRT engine - serialized_engine, input_names, output_names, weight_name_map = ( - engine_cache.load(hash_val) - ) - if serialized_engine is not None: - self._input_names = input_names - self._output_names = output_names - self.weight_name_map = weight_name_map - _LOGGER.info( - "Hit the cached TRT engine. It is loaded for skipping recompilation." - ) - return TRTInterpreterResult( - serialized_engine, - self._input_names, - self._output_names, - self.weight_name_map, - ) + self.compilation_settings.custom_engine_cache is not None + ): # custom_engine_cache could be None if this function is called from convert_exported_program_to_serialized_trt_engine etc. + if ( + self.compilation_settings.cache_built_engines + or self.compilation_settings.reuse_cached_engines + ): + engine_cache = self.compilation_settings.custom_engine_cache + hash_val = engine_cache.get_hash(self.module) + + if self.compilation_settings.reuse_cached_engines: + # query the cached TRT engine + blob = engine_cache.load(hash_val) + if blob is not None: # hit the cache + serialized_engine, input_names, output_names, weight_name_map = ( + engine_cache.unpack(blob) + ) + self._input_names = input_names + self._output_names = output_names + self.weight_name_map = weight_name_map + _LOGGER.info( + "Hit the cached TRT engine. It is loaded and skip recompilation." + ) + # TODO: refit the engine here or outside (within convert_module)? 
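For reference, the blob that load() hands back in this hit path is just the dict that pack() pickled. A minimal round-trip sketch, assuming pack and unpack remain static helpers on BaseEngineCache (the engine bytes below are a placeholder, not a real TensorRT engine):

    # Minimal pack/unpack round-trip; the engine bytes are a stand-in for a real
    # serialized TensorRT engine.
    from torch_tensorrt.dynamo._engine_caching import BaseEngineCache

    blob = BaseEngineCache.pack(
        serialized_engine=b"placeholder-engine-bytes",
        input_names=["x"],
        output_names=["output0"],
        weight_name_map=None,
    )
    engine_bytes, input_names, output_names, weight_name_map = BaseEngineCache.unpack(blob)
    assert engine_bytes == b"placeholder-engine-bytes"
    assert input_names == ["x"] and output_names == ["output0"] and weight_name_map is None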
+ return TRTInterpreterResult( + serialized_engine, + self._input_names, + self._output_names, + self.weight_name_map, + ) self._construct_trt_network_def() @@ -576,14 +581,17 @@ def run( self._save_timing_cache( builder_config, self.compilation_settings.timing_cache_path ) - if self.compilation_settings.save_engine_cache: - engine_cache.save( - hash_val, + if ( + self.compilation_settings.custom_engine_cache is not None + and self.compilation_settings.cache_built_engines + ): + blob = engine_cache.pack( serialized_engine, self._input_names, self._output_names, self.weight_name_map, ) + engine_cache.save(hash_val, blob) with io.BytesIO() as engine_bytes: engine_bytes.write(serialized_engine) diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index dfd22e7f9f..3945e976d6 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -495,6 +495,22 @@ def parse_dynamo_kwargs(kwargs: Any) -> CompilationSettings: ) settings.require_full_compilation = False + # If cache_built_engines and reuse_cached_engines are True but custom_engine_cache is not provided, + # then create a default disk engine cache + if kwargs.get("cache_built_engines") or kwargs.get("reuse_cached_engines"): + if settings.custom_engine_cache is None: + from torch_tensorrt.dynamo._engine_caching import DiskEngineCache + + engine_cache_dir = kwargs.get( + "engine_cache_dir", _defaults.ENGINE_CACHE_DIR + ) + engine_cache_size = kwargs.get( + "engine_cache_size", _defaults.ENGINE_CACHE_SIZE + ) + settings.custom_engine_cache = DiskEngineCache( + engine_cache_dir, engine_cache_size + ) + logger.info("Compilation Settings: %s\n", settings) return settings From 64885deb25651b420fc6171252e87e921a118218 Mon Sep 17 00:00:00 2001 From: Evan Li Date: Wed, 21 Aug 2024 18:01:22 -0700 Subject: [PATCH 07/14] small fixes --- py/torch_tensorrt/dynamo/_engine_caching.py | 4 ++-- py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_engine_caching.py b/py/torch_tensorrt/dynamo/_engine_caching.py index 01220233ea..ee5a6ec854 100644 --- a/py/torch_tensorrt/dynamo/_engine_caching.py +++ b/py/torch_tensorrt/dynamo/_engine_caching.py @@ -8,7 +8,7 @@ import torch from torch._inductor.codecache import FxGraphCachePickler -from torch.fx.experimental.proxy_tensor import maybe_disable_fake_tensor_mode +from torch.fx.experimental.proxy_tensor import unset_fake_temporarily _LOGGER: logging.Logger = logging.getLogger(__name__) @@ -34,7 +34,7 @@ def get_hash(gm: torch.fx.GraphModule) -> str: str: hash value of the GraphModule """ # parameters are set to 0 - with maybe_disable_fake_tensor_mode(): + with unset_fake_temporarily(): new_gm = copy.deepcopy(gm) for name, param in new_gm.named_parameters(): param.data.zero_() diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index a422f7dde4..976c943e0d 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -18,6 +18,7 @@ ) import numpy as np +import tensorrt as trt import torch import torch.fx from torch.fx.node import _get_qualified_name @@ -42,7 +43,6 @@ from torch_tensorrt.fx.observer import Observer from torch_tensorrt.logging import TRT_LOGGER -import tensorrt as trt from packaging import version _LOGGER: logging.Logger = logging.getLogger(__name__) @@ -542,7 +542,7 @@ def run( self._output_names = 
output_names self.weight_name_map = weight_name_map _LOGGER.info( - "Hit the cached TRT engine. It is loaded and skip recompilation." + "Found the cached engine that corresponds to this graph. It is directly loaded." ) # TODO: refit the engine here or outside (within convert_module)? return TRTInterpreterResult( From 88d1a2507dad0e0a1e417f7ba418c5eebad834ef Mon Sep 17 00:00:00 2001 From: Evan Li Date: Fri, 23 Aug 2024 17:52:39 -0700 Subject: [PATCH 08/14] move refit into interpret_module_to_result --- py/torch_tensorrt/dynamo/_engine_caching.py | 6 +- .../dynamo/conversion/_TRTInterpreter.py | 1 - .../dynamo/conversion/_conversion.py | 57 ++++++++++--------- 3 files changed, 34 insertions(+), 30 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_engine_caching.py b/py/torch_tensorrt/dynamo/_engine_caching.py index ee5a6ec854..c8ff7aba50 100644 --- a/py/torch_tensorrt/dynamo/_engine_caching.py +++ b/py/torch_tensorrt/dynamo/_engine_caching.py @@ -48,7 +48,7 @@ def pack( serialized_engine: bytes, input_names: List[str], output_names: List[str], - weight_name_map: Optional[Dict[str, Any]], + weight_name_map: Optional[Dict[Any, Any]], ) -> bytes: """Pack serialized engine, input names, output names, and weight map into a single blob @@ -56,7 +56,7 @@ def pack( serialized_engine (bytes): serialized TRT engine input_names (List[str]): input names of TRT engine output_names (List[str]): output names of TRT engine - weight_name_map (Optional[Dict[str, Any]]): weight name map for refitting + weight_name_map (Optional[Dict[Any, Any]]): weight name map for refitting Returns: bytes: packed blob @@ -73,7 +73,7 @@ def pack( @staticmethod def unpack( packed_obj: bytes, - ) -> Tuple[bytes, List[str], List[str], Optional[Dict[str, Any]]]: + ) -> Tuple[bytes, List[str], List[str], Optional[Dict[Any, Any]]]: """Unpack packed blob into serialized engine, input names, output names, and weight map Args: diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 976c943e0d..d10a5bca38 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -544,7 +544,6 @@ def run( _LOGGER.info( "Found the cached engine that corresponds to this graph. It is directly loaded." ) - # TODO: refit the engine here or outside (within convert_module)? return TRTInterpreterResult( serialized_engine, self._input_names, diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py index e0643cf996..03bf14dfc6 100644 --- a/py/torch_tensorrt/dynamo/conversion/_conversion.py +++ b/py/torch_tensorrt/dynamo/conversion/_conversion.py @@ -114,6 +114,36 @@ def interpret_module_to_result( ) interpreter_result = interpreter.run() + + if settings.make_refitable: + # Run fast refit even if it's the first compilation. + # This is to ensure that the weight name map is correct for future refits. + # If the fast refit fails, remove the weight name map. 
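The refit logic introduced above is needed because the cache key is weight-agnostic: get_hash() deep-copies the graph and zeroes every parameter before hashing, so modules that differ only in their weights share one cache entry and must be refitted with the caller's current weights. A small sketch of that property, assuming get_hash stays a static, weight-agnostic helper:

    # Sketch: two separately initialized copies of the same architecture are
    # expected to produce the same engine-cache hash, since parameters are
    # zeroed before hashing.
    import torch
    import torchvision.models as models
    from torch_tensorrt.dynamo._engine_caching import BaseEngineCache

    gm_a = torch.fx.symbolic_trace(models.resnet18().eval())
    gm_b = torch.fx.symbolic_trace(models.resnet18().eval())  # different random weights
    print(BaseEngineCache.get_hash(gm_a) == BaseEngineCache.get_hash(gm_b))  # expected: True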
+ from torch_tensorrt.dynamo._refit import _refit_single_trt_engine_with_gm + from torch_tensorrt.logging import TRT_LOGGER + + runtime = trt.Runtime(TRT_LOGGER) + refit_test_engine = runtime.deserialize_cuda_engine( + interpreter_result.serialized_engine + ) + try: + _refit_single_trt_engine_with_gm( + new_gm=module, + old_engine=refit_test_engine, + input_list=inputs, + settings=settings, + weight_name_map=interpreter_result.weight_name_map, + ) + except AssertionError: + # TRTInterpreterResult is a tuple, so we need to create a new one + interpreter_result = TRTInterpreterResult( + interpreter_result.serialized_engine, + interpreter_result.input_names, + interpreter_result.output_names, + None, + ) + logger.warning("Fast refit test failed. Removing the weight map caching.") + return interpreter_result @@ -133,31 +163,6 @@ def convert_module( PythonTorchTensorRTModule or TorchTensorRTModule """ interpreter_result = interpret_module_to_result(module, inputs, settings) - # Test fast refit: - from torch_tensorrt.dynamo._refit import _refit_single_trt_engine_with_gm - from torch_tensorrt.logging import TRT_LOGGER - - weight_name_map: Any = None - # Do the test refit with cached map if make_refitable is enabled - if settings.make_refitable: - runtime = trt.Runtime(TRT_LOGGER) - refit_test_engine = runtime.deserialize_cuda_engine( - interpreter_result.serialized_engine - ) - try: - _refit_single_trt_engine_with_gm( - new_gm=module, - old_engine=refit_test_engine, - input_list=inputs, - settings=settings, - weight_name_map=interpreter_result.weight_name_map, - ) - weight_name_map = interpreter_result.weight_name_map - except AssertionError: - logger.warning("Fast refit test failed. Removing the weight map caching.") - - del refit_test_engine - torch.cuda.empty_cache() rt_cls = PythonTorchTensorRTModule @@ -181,5 +186,5 @@ def convert_module( output_binding_names=list(interpreter_result.output_names), name=name, settings=settings, - weight_name_map=weight_name_map, + weight_name_map=interpreter_result.weight_name_map, ) From 81eb7c53eeeb38f4833755615729df6baab3262c Mon Sep 17 00:00:00 2001 From: Evan Li Date: Tue, 27 Aug 2024 12:35:59 -0700 Subject: [PATCH 09/14] update refit usage --- py/torch_tensorrt/dynamo/_compiler.py | 3 ++ .../dynamo/conversion/_TRTInterpreter.py | 18 +++++++++++ .../dynamo/conversion/_conversion.py | 30 ------------------- py/torch_tensorrt/dynamo/utils.py | 3 ++ 4 files changed, 24 insertions(+), 30 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index bc31592c06..349243769c 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -236,6 +236,9 @@ def compile( logger.debug("Lowered Input graph: " + str(gm.graph)) if cache_built_engines or reuse_cached_engines: + assert ( + make_refitable + ), "Engine caching requires make_refitable to be set to True" if custom_engine_cache is None: custom_engine_cache = DiskEngineCache(engine_cache_dir, engine_cache_size) diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index d10a5bca38..60c5012f77 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -544,6 +544,24 @@ def run( _LOGGER.info( "Found the cached engine that corresponds to this graph. It is directly loaded." 
) + + from torch_tensorrt.dynamo._refit import ( + _refit_single_trt_engine_with_gm, + ) + + runtime = trt.Runtime(TRT_LOGGER) + engine = runtime.deserialize_cuda_engine(serialized_engine) + + _refit_single_trt_engine_with_gm( + new_gm=self.module, + old_engine=engine, + input_list=self.input_specs, + settings=self.compilation_settings, + weight_name_map=weight_name_map, + ) + + serialized_engine = bytes(engine.serialize()) + return TRTInterpreterResult( serialized_engine, self._input_names, diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py index 03bf14dfc6..36d7bd71e9 100644 --- a/py/torch_tensorrt/dynamo/conversion/_conversion.py +++ b/py/torch_tensorrt/dynamo/conversion/_conversion.py @@ -114,36 +114,6 @@ def interpret_module_to_result( ) interpreter_result = interpreter.run() - - if settings.make_refitable: - # Run fast refit even if it's the first compilation. - # This is to ensure that the weight name map is correct for future refits. - # If the fast refit fails, remove the weight name map. - from torch_tensorrt.dynamo._refit import _refit_single_trt_engine_with_gm - from torch_tensorrt.logging import TRT_LOGGER - - runtime = trt.Runtime(TRT_LOGGER) - refit_test_engine = runtime.deserialize_cuda_engine( - interpreter_result.serialized_engine - ) - try: - _refit_single_trt_engine_with_gm( - new_gm=module, - old_engine=refit_test_engine, - input_list=inputs, - settings=settings, - weight_name_map=interpreter_result.weight_name_map, - ) - except AssertionError: - # TRTInterpreterResult is a tuple, so we need to create a new one - interpreter_result = TRTInterpreterResult( - interpreter_result.serialized_engine, - interpreter_result.input_names, - interpreter_result.output_names, - None, - ) - logger.warning("Fast refit test failed. 
Removing the weight map caching.") - return interpreter_result diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 3945e976d6..460fd7a9f8 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -498,6 +498,9 @@ def parse_dynamo_kwargs(kwargs: Any) -> CompilationSettings: # If cache_built_engines and reuse_cached_engines are True but custom_engine_cache is not provided, # then create a default disk engine cache if kwargs.get("cache_built_engines") or kwargs.get("reuse_cached_engines"): + assert kwargs.get( + "make_refitable" + ), "Engine caching requires make_refitable to be set to True" if settings.custom_engine_cache is None: from torch_tensorrt.dynamo._engine_caching import DiskEngineCache From 04bb63a0475abfada83c1444dec65876433b5a6d Mon Sep 17 00:00:00 2001 From: Evan Li Date: Tue, 27 Aug 2024 19:52:12 -0700 Subject: [PATCH 10/14] force using slow refit, add unit tests --- examples/dynamo/engine_caching_example.py | 32 ++-- .../dynamo/conversion/_TRTInterpreter.py | 18 ++- tests/py/dynamo/models/test_engine_cache.py | 153 ++++++++++++++++++ 3 files changed, 185 insertions(+), 18 deletions(-) create mode 100644 tests/py/dynamo/models/test_engine_cache.py diff --git a/examples/dynamo/engine_caching_example.py b/examples/dynamo/engine_caching_example.py index 89912e74b0..2d1018bb6e 100644 --- a/examples/dynamo/engine_caching_example.py +++ b/examples/dynamo/engine_caching_example.py @@ -10,7 +10,6 @@ np.random.seed(0) torch.manual_seed(0) -size = (100, 3, 224, 224) model = models.resnet18(pretrained=True).eval().to("cuda") enabled_precisions = {torch.float} @@ -24,7 +23,7 @@ def remove_timing_cache(path=TIMING_CACHE_PATH): os.remove(path) -def dynamo_path(iterations=3): +def dynamo_compile(iterations=3): times = [] start = torch.cuda.Event(enable_timing=True) end = torch.cuda.Event(enable_timing=True) @@ -42,7 +41,7 @@ def dynamo_path(iterations=3): # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. 
for i in range(iterations): inputs = [torch.rand((100 + i, 3, 224, 224)).to("cuda")] - remove_timing_cache() # remove timing cache for engine caching messurement + remove_timing_cache() # remove timing cache just for engine caching messurement if i == 0: cache_built_engines = False reuse_cached_engines = False @@ -63,11 +62,15 @@ def dynamo_path(iterations=3): reuse_cached_engines=reuse_cached_engines, engine_cache_size=1 << 30, # 1GB ) + # output = trt_gm(*inputs) end.record() torch.cuda.synchronize() times.append(start.elapsed_time(end)) - print("-----dynamo_path-----> compilation time:\n", times, "milliseconds") + print("----------------dynamo_compile----------------") + print("disable engine caching, used:", times[0], "ms") + print("enable engine caching to cache engines, used:", times[1], "ms") + print("enable engine caching to reuse engines, used:", times[2], "ms") # Custom Engine Cache @@ -84,11 +87,13 @@ def save( blob: bytes, prefix: str = "blob", ): + if not os.path.exists(self.engine_cache_dir): + os.makedirs(self.engine_cache_dir, exist_ok=True) + path = os.path.join( self.engine_cache_dir, f"{prefix}_{hash}.bin", ) - os.makedirs(path, exist_ok=True) with open(path, "wb") as f: f.write(blob) @@ -101,7 +106,7 @@ def load(self, hash: str, prefix: str = "blob") -> Optional[bytes]: return None -def compile_path(iterations=3): +def torch_compile(iterations=3): times = [] engine_cache = MyEngineCache("/tmp/your_dir") start = torch.cuda.Event(enable_timing=True) @@ -112,8 +117,8 @@ def compile_path(iterations=3): # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. for i in range(iterations): - inputs = [torch.rand(size).to("cuda")] - # remove timing cache and reset dynamo for engine caching messurement + inputs = [torch.rand((100, 3, 224, 224)).to("cuda")] + # remove timing cache and reset dynamo just for engine caching messurement remove_timing_cache() torch._dynamo.reset() @@ -129,7 +134,7 @@ def compile_path(iterations=3): model, backend="tensorrt", options={ - "use_python_runtime": use_python_runtime, + "use_python_runtime": True, "enabled_precisions": enabled_precisions, "debug": debug, "min_block_size": min_block_size, @@ -144,9 +149,12 @@ def compile_path(iterations=3): torch.cuda.synchronize() times.append(start.elapsed_time(end)) - print("-----compile_path-----> compilation time:\n", times, "milliseconds") + print("----------------torch_compile----------------") + print("disable engine caching, used:", times[0], "ms") + print("enable engine caching to cache engines, used:", times[1], "ms") + print("enable engine caching to reuse engines, used:", times[2], "ms") if __name__ == "__main__": - dynamo_path() - # compile_path() + dynamo_compile() + torch_compile() diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 60c5012f77..16a1e0c75b 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -545,25 +545,31 @@ def run( "Found the cached engine that corresponds to this graph. It is directly loaded." 
) + runtime = trt.Runtime(TRT_LOGGER) + engine = runtime.deserialize_cuda_engine(serialized_engine) + from torch_tensorrt.dynamo._refit import ( _refit_single_trt_engine_with_gm, ) - runtime = trt.Runtime(TRT_LOGGER) - engine = runtime.deserialize_cuda_engine(serialized_engine) - + # TODO: Fast refit is problematic for now. It will fail if the engine has batch_norm layers. + # We set weight_name_map=None to use slow refit anyway for now. Will fix it in the future. _refit_single_trt_engine_with_gm( new_gm=self.module, old_engine=engine, input_list=self.input_specs, settings=self.compilation_settings, - weight_name_map=weight_name_map, + weight_name_map=None, ) - serialized_engine = bytes(engine.serialize()) + serialized_engine = engine.serialize() + + with io.BytesIO() as engine_bytes: + engine_bytes.write(serialized_engine) + engine_str = engine_bytes.getvalue() return TRTInterpreterResult( - serialized_engine, + engine_str, self._input_names, self._output_names, self.weight_name_map, diff --git a/tests/py/dynamo/models/test_engine_cache.py b/tests/py/dynamo/models/test_engine_cache.py new file mode 100644 index 0000000000..7b6247ced9 --- /dev/null +++ b/tests/py/dynamo/models/test_engine_cache.py @@ -0,0 +1,153 @@ +# type: ignore +import os +import shutil +import unittest +from typing import Optional + +import torch +import torch_tensorrt as torch_trt +import torchvision.models as models +from torch.testing._internal.common_utils import TestCase +from torch_tensorrt.dynamo._defaults import ENGINE_CACHE_DIR +from torch_tensorrt.dynamo._engine_caching import BaseEngineCache +from torch_tensorrt.dynamo.utils import COSINE_THRESHOLD, cosine_similarity + +assertions = unittest.TestCase() + + +class MyEngineCache(BaseEngineCache): + def __init__( + self, + engine_cache_dir: str, + ) -> None: + self.engine_cache_dir = engine_cache_dir + + def save( + self, + hash: str, + blob: bytes, + prefix: str = "blob", + ): + if not os.path.exists(self.engine_cache_dir): + os.makedirs(self.engine_cache_dir, exist_ok=True) + + path = os.path.join( + self.engine_cache_dir, + f"{prefix}_{hash}.bin", + ) + with open(path, "wb") as f: + f.write(blob) + + def load(self, hash: str, prefix: str = "blob") -> Optional[bytes]: + path = os.path.join(self.engine_cache_dir, f"{prefix}_{hash}.bin") + if os.path.exists(path): + with open(path, "rb") as f: + blob = f.read() + return blob + return None + + +class TestEngineCache(TestCase): + + def test_dynamo_compile(self): + model = models.resnet18(pretrained=True).eval().to("cuda") + example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),) + # Mark the dim0 of inputs as dynamic + batch = torch.export.Dim("batch", min=1, max=200) + exp_program = torch.export.export( + model, args=example_inputs, dynamic_shapes={"x": {0: batch}} + ) + engine_cache_dir = ENGINE_CACHE_DIR + if os.path.exists(engine_cache_dir): + shutil.rmtree(engine_cache_dir) + # The 1st iteration is to measure the compilation time without engine caching + # The 2nd and 3rd iterations are to measure the compilation time with engine caching. + # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. + # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. 
+ inputs = [torch.rand((128, 3, 224, 224)).to("cuda")] + results = [] + for i in range(3): + if i == 0: + cache_built_engines = False + reuse_cached_engines = False + else: + cache_built_engines = True + reuse_cached_engines = True + + trt_gm = torch_trt.dynamo.compile( + exp_program, + tuple(inputs), + use_python_runtime=False, + enabled_precisions={torch.float}, + debug=False, + min_block_size=1, + make_refitable=True, + cache_built_engines=cache_built_engines, + reuse_cached_engines=reuse_cached_engines, + engine_cache_size=1 << 30, # 1GB + ) + results.append(trt_gm(*inputs)) + + cos_sim = cosine_similarity(results[0], results[1]) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"test_dynamo_compile TRT without engine caching doesn't match with that with engine caching. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + cos_sim = cosine_similarity(results[1], results[2]) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"test_dynamo_compile TRT with engine caching doesn't match with that cached engine. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + def test_torch_compile(self): + # Custom Engine Cache + model = models.resnet18(pretrained=True).eval().to("cuda") + + engine_cache_dir = "/tmp/your_dir" + if os.path.exists(engine_cache_dir): + shutil.rmtree(engine_cache_dir) + + engine_cache = MyEngineCache(engine_cache_dir) + # The 1st iteration is to measure the compilation time without engine caching + # The 2nd and 3rd iterations are to measure the compilation time with engine caching. + # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. + # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. + inputs = [torch.rand((100, 3, 224, 224)).to("cuda")] + results = [] + for i in range(3): + # remove timing cache and reset dynamo for engine caching messurement + if i == 0: + cache_built_engines = False + reuse_cached_engines = False + else: + cache_built_engines = True + reuse_cached_engines = True + + compiled_model = torch.compile( + model, + backend="tensorrt", + options={ + "use_python_runtime": True, + "enabled_precisions": {torch.float}, + "debug": False, + "min_block_size": 1, + "make_refitable": True, + "cache_built_engines": cache_built_engines, + "reuse_cached_engines": reuse_cached_engines, + "custom_engine_cache": engine_cache, # use custom engine cache + }, + ) + results.append(compiled_model(*inputs)) # trigger the compilation + + cos_sim = cosine_similarity(results[0], results[1]) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"test_torch_compile TRT without engine caching doesn't match with that with engine caching. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + cos_sim = cosine_similarity(results[1], results[2]) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"test_torch_compile TRT with engine caching doesn't match with that cached engine. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) From 16315dd5bc2f460748f59c35cca0e6a31f2f1177 Mon Sep 17 00:00:00 2001 From: Evan Li Date: Wed, 28 Aug 2024 14:41:48 -0700 Subject: [PATCH 11/14] fix issues from comments, add more unit tests --- py/torch_tensorrt/dynamo/_compiler.py | 24 ++- py/torch_tensorrt/dynamo/_settings.py | 4 - py/torch_tensorrt/dynamo/backend/backends.py | 7 +- .../dynamo/conversion/_TRTInterpreter.py | 25 ++- .../dynamo/conversion/_conversion.py | 10 +- py/torch_tensorrt/dynamo/utils.py | 19 +- tests/py/dynamo/models/test_engine_cache.py | 175 +++++++++++++++++- 7 files changed, 223 insertions(+), 41 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 349243769c..c28702f451 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -85,8 +85,8 @@ def compile( lazy_engine_init: bool = _defaults.LAZY_ENGINE_INIT, cache_built_engines: bool = _defaults.CACHE_BUILT_ENGINES, reuse_cached_engines: bool = _defaults.REUSE_CACHED_ENGINES, - engine_cache_dir: str = _defaults.ENGINE_CACHE_DIR, - engine_cache_size: int = _defaults.ENGINE_CACHE_SIZE, + engine_cache_dir: Optional[str] = _defaults.ENGINE_CACHE_DIR, + engine_cache_size: Optional[int] = _defaults.ENGINE_CACHE_SIZE, custom_engine_cache: Optional[BaseEngineCache] = _defaults.CUSTOM_ENGINE_CACHE, **kwargs: Any, ) -> torch.fx.GraphModule: @@ -155,8 +155,8 @@ def compile( lazy_engine_init (bool): Defer setting up engines until the compilation of all engines is complete. Can allow larger models with multiple graph breaks to compile but can lead to oversubscription of GPU memory at runtime. cache_built_engines (bool): Whether to save the compiled TRT engines to storage reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage - engine_cache_dir (str): Directory to store the cached TRT engines - engine_cache_size (int): Maximum hard-disk space to use for the engine cache + engine_cache_dir (Optional[str]): Directory to store the cached TRT engines + engine_cache_size (Optional[int]): Maximum hard-disk space (bytes) to use for the engine cache, default is 1GB. If the cache exceeds this size, the oldest engines will be removed by default custom_engine_cache (Optional[BaseEngineCache]): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache. If used, engine_cache_dir and engine_cache_size will be ignored. 
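Engine caching is only permitted for refittable engines; the guard added later in this hunk asserts make_refitable. A short sketch of the expected failure when the flag is left off, reusing exp_program and inputs as defined in the example script and tests above:

    # Sketch: requesting engine caching without make_refitable should fail fast.
    # exp_program and inputs are assumed to be defined as in the examples/tests above.
    import torch_tensorrt as torch_trt

    try:
        torch_trt.dynamo.compile(
            exp_program,
            tuple(inputs),
            make_refitable=False,  # caching requires refittable engines
            cache_built_engines=True,
            reuse_cached_engines=True,
        )
    except AssertionError as err:
        print(err)  # Engine caching requires make_refitable to be set to True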
**kwargs: Any, Returns: @@ -235,12 +235,16 @@ def compile( gm = post_lowering(gm) logger.debug("Lowered Input graph: " + str(gm.graph)) + engine_cache = None if cache_built_engines or reuse_cached_engines: assert ( make_refitable ), "Engine caching requires make_refitable to be set to True" - if custom_engine_cache is None: - custom_engine_cache = DiskEngineCache(engine_cache_dir, engine_cache_size) + engine_cache = ( + custom_engine_cache + if custom_engine_cache is not None + else DiskEngineCache(engine_cache_dir, engine_cache_size) + ) compilation_options = { "enabled_precisions": ( @@ -277,12 +281,13 @@ def compile( "lazy_engine_init": lazy_engine_init, "cache_built_engines": cache_built_engines, "reuse_cached_engines": reuse_cached_engines, - "custom_engine_cache": custom_engine_cache, } settings = CompilationSettings(**compilation_options) logger.info("Compilation Settings: %s\n", settings) - trt_gm = compile_module(gm, trt_arg_inputs, trt_kwarg_inputs, settings) + trt_gm = compile_module( + gm, trt_arg_inputs, trt_kwarg_inputs, settings, engine_cache + ) return trt_gm @@ -291,6 +296,7 @@ def compile_module( sample_arg_inputs: Sequence[Input], sample_kwarg_inputs: Optional[dict[Any, Any]] = None, settings: CompilationSettings = CompilationSettings(), + engine_cache: Optional[BaseEngineCache] = None, ) -> torch.fx.GraphModule: """Compile a traced FX module @@ -301,6 +307,7 @@ def compile_module( arg_inputs: Inputs to the module kwarg_inputs: kwargs to the module settings: Compilation settings + engine_cache: Engine cache instance to store/load compiled engines Returns: Compiled FX GraphModule """ @@ -457,6 +464,7 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool: submodule_inputs, settings=settings, name=name, + engine_cache=engine_cache, ) trt_modules[name] = trt_module diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py index 0327727c9f..063f6f3718 100644 --- a/py/torch_tensorrt/dynamo/_settings.py +++ b/py/torch_tensorrt/dynamo/_settings.py @@ -7,7 +7,6 @@ from torch_tensorrt.dynamo._defaults import ( ASSUME_DYNAMIC_SHAPE_SUPPORT, CACHE_BUILT_ENGINES, - CUSTOM_ENGINE_CACHE, DEBUG, DISABLE_TF32, DLA_GLOBAL_DRAM_SIZE, @@ -36,7 +35,6 @@ WORKSPACE_SIZE, default_device, ) -from torch_tensorrt.dynamo._engine_caching import BaseEngineCache @dataclass @@ -80,7 +78,6 @@ class CompilationSettings: timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation cache_built_engines (bool): Whether to save the compiled TRT engines to storage reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage - custom_engine_cache (Optional[BaseEngineCache]): Engine cache instance to use for saving and loading engines. 
Users can provide their own engine cache by inheriting from BaseEngineCache """ enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS) @@ -115,4 +112,3 @@ class CompilationSettings: lazy_engine_init: bool = LAZY_ENGINE_INIT cache_built_engines: bool = CACHE_BUILT_ENGINES reuse_cached_engines: bool = REUSE_CACHED_ENGINES - custom_engine_cache: Optional[BaseEngineCache] = CUSTOM_ENGINE_CACHE diff --git a/py/torch_tensorrt/dynamo/backend/backends.py b/py/torch_tensorrt/dynamo/backend/backends.py index ae3cb38f2d..605d963a50 100644 --- a/py/torch_tensorrt/dynamo/backend/backends.py +++ b/py/torch_tensorrt/dynamo/backend/backends.py @@ -48,14 +48,15 @@ def torch_tensorrt_backend( def aot_torch_tensorrt_aten_backend( gm: torch.fx.GraphModule, sample_inputs: Sequence[Any], **kwargs: Any ) -> torch.nn.Module: - settings = parse_dynamo_kwargs(kwargs) - return _pretraced_backend(gm, sample_inputs, settings) + settings, engine_cache = parse_dynamo_kwargs(kwargs) + return _pretraced_backend(gm, sample_inputs, settings, engine_cache) def _pretraced_backend( gm: torch.fx.GraphModule, sample_inputs: Sequence[Any], settings: CompilationSettings = CompilationSettings(), + engine_cache: Any = None, ) -> torch.fx.GraphModule | Callable[..., Any]: """Helper function to manage translation of traced FX module to TRT engines @@ -63,6 +64,7 @@ def _pretraced_backend( module: FX GraphModule to convert inputs: Inputs to the module settings: Compilation settings + engine_cache: Engine cache instance Returns: Compiled FX GraphModule """ @@ -109,6 +111,7 @@ def _pretraced_backend( gm, torchtrt_inputs, settings=settings, + engine_cache=engine_cache, ) return trt_compiled except (AssertionError, RuntimeError): diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 16a1e0c75b..22743af0aa 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -27,6 +27,7 @@ from torch_tensorrt._enums import dtype from torch_tensorrt._Input import Input from torch_tensorrt.dynamo import _defaults +from torch_tensorrt.dynamo._engine_caching import BaseEngineCache from torch_tensorrt.dynamo._settings import CompilationSettings from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext from torch_tensorrt.dynamo.conversion._ConverterRegistry import ( @@ -71,6 +72,7 @@ def __init__( logger_level: trt.ILogger.Severity = trt.ILogger.Severity.WARNING, output_dtypes: Optional[Sequence[dtype]] = None, compilation_settings: CompilationSettings = CompilationSettings(), + engine_cache: Optional[BaseEngineCache] = None, ): super().__init__(module) @@ -126,6 +128,9 @@ def __init__( self.const_mapping: Dict[str, Tuple[Sequence[int], str]] = {} self.weight_name_map: Optional[dict[str, Any]] = None + # Engine cache for storing and reusing TRT engines + self.engine_cache = engine_cache + def validate_conversion(self) -> Set[str]: missing_converters: Set[str] = set() @@ -521,22 +526,22 @@ def run( Return: TRTInterpreterResult """ - if ( - self.compilation_settings.custom_engine_cache is not None - ): # custom_engine_cache could be None if this function is called from convert_exported_program_to_serialized_trt_engine etc. 
+ # self.engine_cache could be None if: + # 1) engine_cache is not passed in when calling this function like convert_exported_program_to_serialized_trt_engine etc., or + # 2) both cache_built_engines and reuse_cached_engines are False + if self.engine_cache is not None: if ( self.compilation_settings.cache_built_engines or self.compilation_settings.reuse_cached_engines ): - engine_cache = self.compilation_settings.custom_engine_cache - hash_val = engine_cache.get_hash(self.module) + hash_val = self.engine_cache.get_hash(self.module) if self.compilation_settings.reuse_cached_engines: # query the cached TRT engine - blob = engine_cache.load(hash_val) + blob = self.engine_cache.load(hash_val) if blob is not None: # hit the cache serialized_engine, input_names, output_names, weight_name_map = ( - engine_cache.unpack(blob) + self.engine_cache.unpack(blob) ) self._input_names = input_names self._output_names = output_names @@ -605,16 +610,16 @@ def run( builder_config, self.compilation_settings.timing_cache_path ) if ( - self.compilation_settings.custom_engine_cache is not None + self.engine_cache is not None and self.compilation_settings.cache_built_engines ): - blob = engine_cache.pack( + blob = self.engine_cache.pack( serialized_engine, self._input_names, self._output_names, self.weight_name_map, ) - engine_cache.save(hash_val, blob) + self.engine_cache.save(hash_val, blob) with io.BytesIO() as engine_bytes: engine_bytes.write(serialized_engine) diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py index 36d7bd71e9..cd38ce56e6 100644 --- a/py/torch_tensorrt/dynamo/conversion/_conversion.py +++ b/py/torch_tensorrt/dynamo/conversion/_conversion.py @@ -10,6 +10,7 @@ from torch_tensorrt._enums import dtype from torch_tensorrt._features import ENABLED_FEATURES from torch_tensorrt._Input import Input +from torch_tensorrt.dynamo._engine_caching import BaseEngineCache from torch_tensorrt.dynamo._settings import CompilationSettings from torch_tensorrt.dynamo.conversion._TRTInterpreter import ( TRTInterpreter, @@ -76,6 +77,7 @@ def interpret_module_to_result( settings: CompilationSettings = CompilationSettings(), arg_inputs: Optional[Sequence[Input]] = None, kwarg_inputs: Optional[dict[str, Any]] = None, + engine_cache: Optional[BaseEngineCache] = None, ) -> TRTInterpreterResult: """Interpret an FX module to a TRTInterpreterResult Args: @@ -85,6 +87,7 @@ def interpret_module_to_result( arg_inputs: Sequence of Tensors representing inputs to the module. kwarg_inputs: A dictionary of Tensors representing inputs to the module. 
settings: Compilation settings + engine_cache: Engine cache instance Returns: TRTInterpreterResult """ @@ -111,6 +114,7 @@ def interpret_module_to_result( logger_level=(trt.Logger.VERBOSE if settings.debug else trt.Logger.WARNING), output_dtypes=output_dtypes, compilation_settings=settings, + engine_cache=engine_cache, ) interpreter_result = interpreter.run() @@ -122,6 +126,7 @@ def convert_module( inputs: Sequence[Input], settings: CompilationSettings = CompilationSettings(), name: str = "", + engine_cache: Optional[BaseEngineCache] = None, ) -> PythonTorchTensorRTModule | TorchTensorRTModule: """Convert an FX module to a TRT module Args: @@ -129,10 +134,13 @@ def convert_module( inputs: Sequence of Tensors representing inputs to the module settings: Compilation settings name: TRT engine name + engine_cache: Engine cache instance Returns: PythonTorchTensorRTModule or TorchTensorRTModule """ - interpreter_result = interpret_module_to_result(module, inputs, settings) + interpreter_result = interpret_module_to_result( + module, inputs, settings, engine_cache=engine_cache + ) rt_cls = PythonTorchTensorRTModule diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 460fd7a9f8..66192d59a0 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -3,7 +3,7 @@ import logging from dataclasses import fields, replace from enum import Enum -from typing import Any, Callable, Dict, Optional, Sequence, Union +from typing import Any, Callable, Dict, Optional, Sequence, Tuple, Union import numpy as np import tensorrt as trt @@ -13,6 +13,7 @@ from torch_tensorrt._enums import dtype from torch_tensorrt._Input import Input from torch_tensorrt.dynamo import _defaults +from torch_tensorrt.dynamo._engine_caching import BaseEngineCache from torch_tensorrt.dynamo._settings import CompilationSettings from packaging import version @@ -438,7 +439,9 @@ def to_torch_tensorrt_device( return Device._from(device) -def parse_dynamo_kwargs(kwargs: Any) -> CompilationSettings: +def parse_dynamo_kwargs( + kwargs: Any, +) -> Tuple[CompilationSettings, Optional[BaseEngineCache]]: """Parses the kwargs field of a Dynamo backend Args: @@ -497,11 +500,15 @@ def parse_dynamo_kwargs(kwargs: Any) -> CompilationSettings: # If cache_built_engines and reuse_cached_engines are True but custom_engine_cache is not provided, # then create a default disk engine cache + engine_cache = None if kwargs.get("cache_built_engines") or kwargs.get("reuse_cached_engines"): assert kwargs.get( "make_refitable" ), "Engine caching requires make_refitable to be set to True" - if settings.custom_engine_cache is None: + + if kwargs.get("custom_engine_cache") is not None: + engine_cache = kwargs.get("custom_engine_cache") + else: from torch_tensorrt.dynamo._engine_caching import DiskEngineCache engine_cache_dir = kwargs.get( @@ -510,13 +517,11 @@ def parse_dynamo_kwargs(kwargs: Any) -> CompilationSettings: engine_cache_size = kwargs.get( "engine_cache_size", _defaults.ENGINE_CACHE_SIZE ) - settings.custom_engine_cache = DiskEngineCache( - engine_cache_dir, engine_cache_size - ) + engine_cache = DiskEngineCache(engine_cache_dir, engine_cache_size) logger.info("Compilation Settings: %s\n", settings) - return settings + return settings, engine_cache def req_torch_version(min_torch_version: str = "2.dev") -> Callable[..., Any]: diff --git a/tests/py/dynamo/models/test_engine_cache.py b/tests/py/dynamo/models/test_engine_cache.py index 7b6247ced9..1a5b874eb4 100644 --- 
a/tests/py/dynamo/models/test_engine_cache.py +++ b/tests/py/dynamo/models/test_engine_cache.py @@ -49,7 +49,7 @@ def load(self, hash: str, prefix: str = "blob") -> Optional[bytes]: class TestEngineCache(TestCase): - def test_dynamo_compile(self): + def test_dynamo_compile_with_default_disk_engine_cache(self): model = models.resnet18(pretrained=True).eval().to("cuda") example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),) # Mark the dim0 of inputs as dynamic @@ -57,15 +57,87 @@ def test_dynamo_compile(self): exp_program = torch.export.export( model, args=example_inputs, dynamic_shapes={"x": {0: batch}} ) + engine_cache_dir = ENGINE_CACHE_DIR if os.path.exists(engine_cache_dir): shutil.rmtree(engine_cache_dir) + + # The 1st iteration is to measure the compilation time without engine caching + # The 2nd and 3rd iterations are to measure the compilation time with engine caching. + # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. + # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. + inputs = [torch.rand((128, 3, 224, 224)).to("cuda")] + results = [] + times = [] + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + for i in range(3): + if i == 0: + cache_built_engines = False + reuse_cached_engines = False + else: + cache_built_engines = True + reuse_cached_engines = True + + start.record() + trt_gm = torch_trt.dynamo.compile( + exp_program, + tuple(inputs), + use_python_runtime=False, + enabled_precisions={torch.float}, + debug=False, + min_block_size=1, + make_refitable=True, + cache_built_engines=cache_built_engines, + reuse_cached_engines=reuse_cached_engines, + ) + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) + results.append(trt_gm(*inputs)) + + cos_sim = cosine_similarity(results[0], results[1]) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"test_dynamo_compile_with_default_disk_engine_cache: results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + cos_sim = cosine_similarity(results[1], results[2]) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"test_dynamo_compile_with_default_disk_engine_cache: results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + assertions.assertTrue( + times[0] > times[2], + msg=f"test_dynamo_compile_with_default_disk_engine_cache: Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", + ) + + def test_dynamo_compile_with_custom_engine_cache(self): + model = models.resnet18(pretrained=True).eval().to("cuda") + + engine_cache_dir = "/tmp/your_dir" + if os.path.exists(engine_cache_dir): + shutil.rmtree(engine_cache_dir) + + custom_engine_cache = MyEngineCache(engine_cache_dir) + + example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),) + # Mark the dim0 of inputs as dynamic + batch = torch.export.Dim("batch", min=1, max=200) + exp_program = torch.export.export( + model, args=example_inputs, dynamic_shapes={"x": {0: batch}} + ) + # The 1st iteration is to measure the compilation time without engine caching # The 2nd and 3rd iterations are to measure the compilation time with engine caching. # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. 
# The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. inputs = [torch.rand((128, 3, 224, 224)).to("cuda")] results = [] + times = [] + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) for i in range(3): if i == 0: cache_built_engines = False @@ -74,6 +146,7 @@ def test_dynamo_compile(self): cache_built_engines = True reuse_cached_engines = True + start.record() trt_gm = torch_trt.dynamo.compile( exp_program, tuple(inputs), @@ -84,23 +157,95 @@ def test_dynamo_compile(self): make_refitable=True, cache_built_engines=cache_built_engines, reuse_cached_engines=reuse_cached_engines, - engine_cache_size=1 << 30, # 1GB + custom_engine_cache=custom_engine_cache, ) + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) results.append(trt_gm(*inputs)) cos_sim = cosine_similarity(results[0], results[1]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_dynamo_compile TRT without engine caching doesn't match with that with engine caching. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"test_dynamo_compile_with_custom_engine_cache: results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) cos_sim = cosine_similarity(results[1], results[2]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_dynamo_compile TRT with engine caching doesn't match with that cached engine. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"test_dynamo_compile_with_custom_engine_cache: results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) - def test_torch_compile(self): + assertions.assertTrue( + times[0] > times[2], + msg=f"test_dynamo_compile_with_custom_engine_cache: Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", + ) + + def test_torch_compile_with_default_disk_engine_cache(self): + # Custom Engine Cache + model = models.resnet18(pretrained=True).eval().to("cuda") + + engine_cache_dir = "/tmp/test_torch_compile_with_default_disk_engine_cache" + if os.path.exists(engine_cache_dir): + shutil.rmtree(engine_cache_dir) + + # The 1st iteration is to measure the compilation time without engine caching + # The 2nd and 3rd iterations are to measure the compilation time with engine caching. + # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. + # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. 
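With the default disk cache exercised below, every graph hash gets its own subdirectory holding a single blob.bin. A rough sketch for inspecting what was written after a cached run (the directory matches the one configured in the test that follows):

    # Sketch: inspect DiskEngineCache output, assuming the on-disk layout used in
    # this patch: {engine_cache_dir}/{graph_hash}/blob.bin
    import os

    engine_cache_dir = "/tmp/test_torch_compile_with_default_disk_engine_cache"
    if os.path.exists(engine_cache_dir):
        for graph_hash in os.listdir(engine_cache_dir):
            blob_path = os.path.join(engine_cache_dir, graph_hash, "blob.bin")
            if os.path.isfile(blob_path):
                print(graph_hash, os.path.getsize(blob_path), "bytes")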
+ inputs = [torch.rand((100, 3, 224, 224)).to("cuda")] + results = [] + times = [] + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + for i in range(3): + # remove timing cache and reset dynamo for engine caching messurement + if i == 0: + cache_built_engines = False + reuse_cached_engines = False + else: + cache_built_engines = True + reuse_cached_engines = True + + start.record() + compiled_model = torch.compile( + model, + backend="tensorrt", + options={ + "use_python_runtime": True, + "enabled_precisions": {torch.float}, + "debug": False, + "min_block_size": 1, + "make_refitable": True, + "cache_built_engines": cache_built_engines, + "reuse_cached_engines": reuse_cached_engines, + "engine_cache_dir": engine_cache_dir, + "engine_cache_size": 1 << 30, # 1GB + }, + ) + results.append(compiled_model(*inputs)) # trigger the compilation + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) + + cos_sim = cosine_similarity(results[0], results[1]) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"test_torch_compile_with_default_disk_engine_cache: results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + cos_sim = cosine_similarity(results[1], results[2]) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"test_torch_compile_with_default_disk_engine_cache: results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + assertions.assertTrue( + times[0] > times[2], + msg=f"test_torch_compile_with_default_disk_engine_cache: Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", + ) + + def test_torch_compile_with_custom_engine_cache(self): # Custom Engine Cache model = models.resnet18(pretrained=True).eval().to("cuda") @@ -108,13 +253,16 @@ def test_torch_compile(self): if os.path.exists(engine_cache_dir): shutil.rmtree(engine_cache_dir) - engine_cache = MyEngineCache(engine_cache_dir) + custom_engine_cache = MyEngineCache(engine_cache_dir) # The 1st iteration is to measure the compilation time without engine caching # The 2nd and 3rd iterations are to measure the compilation time with engine caching. # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. 
inputs = [torch.rand((100, 3, 224, 224)).to("cuda")] results = [] + times = [] + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) for i in range(3): # remove timing cache and reset dynamo for engine caching messurement if i == 0: @@ -124,6 +272,7 @@ def test_torch_compile(self): cache_built_engines = True reuse_cached_engines = True + start.record() compiled_model = torch.compile( model, backend="tensorrt", @@ -135,19 +284,27 @@ def test_torch_compile(self): "make_refitable": True, "cache_built_engines": cache_built_engines, "reuse_cached_engines": reuse_cached_engines, - "custom_engine_cache": engine_cache, # use custom engine cache + "custom_engine_cache": custom_engine_cache, }, ) results.append(compiled_model(*inputs)) # trigger the compilation + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) cos_sim = cosine_similarity(results[0], results[1]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_torch_compile TRT without engine caching doesn't match with that with engine caching. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"test_torch_compile_with_custom_engine_cache: results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) cos_sim = cosine_similarity(results[1], results[2]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_torch_compile TRT with engine caching doesn't match with that cached engine. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"test_torch_compile_with_custom_engine_cache: results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + assertions.assertTrue( + times[0] > times[2], + msg=f"test_torch_compile_with_custom_engine_cache: Engine caching didn't speed up the compilation. 
Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", ) From f3e4234f2f5d710895b48d6205b5f799bbc4bd19 Mon Sep 17 00:00:00 2001 From: Evan Li Date: Wed, 28 Aug 2024 18:31:35 -0700 Subject: [PATCH 12/14] fix CI errors --- .../conversion/test_bitwise_and_aten.py | 7 ++++- .../conversion/test_embedding_bag_aten.py | 7 ++++- .../conversion/test_index_select_aten.py | 7 ++++- tests/py/dynamo/models/test_dtype_support.py | 14 ++++++++++ tests/py/dynamo/models/test_dyn_models.py | 14 ++++++++++ tests/py/dynamo/models/test_engine_cache.py | 26 ++++++++++--------- .../dynamo/models/test_export_kwargs_serde.py | 14 ++++++++++ tests/py/dynamo/models/test_export_serde.py | 23 +++++++++++++++- tests/py/dynamo/models/test_models.py | 10 +++++++ tests/py/dynamo/models/test_models_export.py | 14 ++++++++++ 10 files changed, 120 insertions(+), 16 deletions(-) diff --git a/tests/py/dynamo/conversion/test_bitwise_and_aten.py b/tests/py/dynamo/conversion/test_bitwise_and_aten.py index a29a8061db..c42fd2e61f 100644 --- a/tests/py/dynamo/conversion/test_bitwise_and_aten.py +++ b/tests/py/dynamo/conversion/test_bitwise_and_aten.py @@ -141,7 +141,12 @@ def forward(self, lhs_val, rhs_val): mod, inputs, dynamic_shapes=({1: dyn_dim}, {0: dyn_dim}) ) trt_mod = torch_tensorrt.dynamo.compile( - fx_mod, inputs=inputs, enable_precisions={torch.bool}, min_block_size=1 + fx_mod, + inputs=inputs, + enable_precisions={torch.bool}, + min_block_size=1, + cache_built_engines=False, + reuse_cached_engines=False, ) with torch.no_grad(): cuda_inputs = [] diff --git a/tests/py/dynamo/conversion/test_embedding_bag_aten.py b/tests/py/dynamo/conversion/test_embedding_bag_aten.py index d935134ff2..3fef3d70cf 100644 --- a/tests/py/dynamo/conversion/test_embedding_bag_aten.py +++ b/tests/py/dynamo/conversion/test_embedding_bag_aten.py @@ -484,7 +484,12 @@ def forward(self, weights, indices, offsets, per_sample_weights=None): dynamic_shapes["per_sample_weights"] = {} fx_mod = torch.export.export(mod, inputs, dynamic_shapes=dynamic_shapes) trt_mod = torch_tensorrt.dynamo.compile( - fx_mod, inputs=inputs, enable_precisions=torch.float32, min_block_size=1 + fx_mod, + inputs=inputs, + enable_precisions=torch.float32, + min_block_size=1, + cache_built_engines=False, + reuse_cached_engines=False, ) # use the inputs with different shape to inference: if per_sample_weights is None: diff --git a/tests/py/dynamo/conversion/test_index_select_aten.py b/tests/py/dynamo/conversion/test_index_select_aten.py index 3d0b41b791..b1339efdcf 100644 --- a/tests/py/dynamo/conversion/test_index_select_aten.py +++ b/tests/py/dynamo/conversion/test_index_select_aten.py @@ -109,7 +109,12 @@ def forward(self, source_tensor, indice_tensor): fx_mod = torch.export.export(mod, inputs, dynamic_shapes=dynamic_shapes) trt_mod = torch_tensorrt.dynamo.compile( - fx_mod, inputs=inputs, enable_precisions=torch.float32, min_block_size=1 + fx_mod, + inputs=inputs, + enable_precisions=torch.float32, + min_block_size=1, + cache_built_engines=False, + reuse_cached_engines=False, ) # use different shape of inputs for inference: inputs = (source_tensor_1, indice_tensor) diff --git a/tests/py/dynamo/models/test_dtype_support.py b/tests/py/dynamo/models/test_dtype_support.py index 29faf4eff3..b486784e52 100644 --- a/tests/py/dynamo/models/test_dtype_support.py +++ b/tests/py/dynamo/models/test_dtype_support.py @@ -41,6 +41,8 @@ def forward(self, x): truncate_double=True, min_block_size=1, use_python_runtime=False, + cache_built_engines=False, 
+ reuse_cached_engines=False, ) torch_model_results = mod(in_tensor) @@ -79,6 +81,8 @@ def forward(self, x): truncate_double=True, min_block_size=1, use_python_runtime=True, + cache_built_engines=False, + reuse_cached_engines=False, ) torch_model_results = mod(in_tensor) @@ -123,6 +127,8 @@ def forward(self, x): truncate_double=False, min_block_size=1, use_python_runtime=False, + cache_built_engines=False, + reuse_cached_engines=False, ) torch_model_results = mod(in_tensor) @@ -162,6 +168,8 @@ def forward(self, x): truncate_double=False, min_block_size=1, use_python_runtime=True, + cache_built_engines=False, + reuse_cached_engines=False, ) torch_model_results = mod(in_tensor) @@ -214,6 +222,8 @@ def forward(self, x): enabled_precisions={torch.float, torch.bfloat16, torch.half}, min_block_size=1, use_python_runtime=False, + cache_built_engines=False, + reuse_cached_engines=False, ) torch_model_results = mod(in_tensor) @@ -252,6 +262,8 @@ def forward(self, x): enabled_precisions={torch.float, torch.bfloat16, torch.half}, min_block_size=1, use_python_runtime=True, + cache_built_engines=False, + reuse_cached_engines=False, ) torch_model_results = mod(in_tensor) @@ -289,6 +301,8 @@ def forward(self, x): debug=True, min_block_size=1, device=device, + cache_built_engines=False, + reuse_cached_engines=False, ) torch_model_results = mod(*inputs) diff --git a/tests/py/dynamo/models/test_dyn_models.py b/tests/py/dynamo/models/test_dyn_models.py index 67eaddcc6c..d5627499f5 100644 --- a/tests/py/dynamo/models/test_dyn_models.py +++ b/tests/py/dynamo/models/test_dyn_models.py @@ -36,6 +36,8 @@ def forward(self, x): "ir": ir, "pass_through_build_failures": True, "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, } if ir == "torch_compile": input_bs4 = torch.randn((4, 3, 224, 224)).to("cuda") @@ -90,6 +92,8 @@ def forward(self, x): "pass_through_build_failures": True, "torch_executed_ops": {"torch.ops.aten.abs.default"}, "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, } if ir == "torch_compile": @@ -141,6 +145,8 @@ def forward(self, x): "ir": ir, "pass_through_build_failures": True, "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, } if ir == "torch_compile": @@ -184,6 +190,8 @@ def test_resnet_dynamic(ir): "ir": ir, "pass_through_build_failures": True, "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, } if ir == "torch_compile": @@ -246,6 +254,8 @@ def forward(self, x): "pass_through_build_failures": True, "optimization_level": 1, "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -278,6 +288,8 @@ def forward(self, x): "enabled_precisions": {torch.float}, "ir": ir, "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, } inputs_bs2 = torch.randn(2, 2, 10).to("cuda") if ir == "torch_compile": @@ -332,6 +344,8 @@ def forward(self, x): "pass_through_build_failures": True, "min_block_size": 1, "torch_executed_ops": {"torch.ops.aten.add.Tensor"}, + "cache_built_engines": False, + "reuse_cached_engines": False, } # Compile the model diff --git a/tests/py/dynamo/models/test_engine_cache.py b/tests/py/dynamo/models/test_engine_cache.py index 1a5b874eb4..24bb96c4f2 100644 --- a/tests/py/dynamo/models/test_engine_cache.py +++ b/tests/py/dynamo/models/test_engine_cache.py @@ -21,6 +21,8 @@ def __init__( engine_cache_dir: str, ) -> None: 
self.engine_cache_dir = engine_cache_dir + if not os.path.exists(self.engine_cache_dir): + os.makedirs(self.engine_cache_dir, exist_ok=True) def save( self, @@ -99,18 +101,18 @@ def test_dynamo_compile_with_default_disk_engine_cache(self): cos_sim = cosine_similarity(results[0], results[1]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_dynamo_compile_with_default_disk_engine_cache: results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) cos_sim = cosine_similarity(results[1], results[2]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_dynamo_compile_with_default_disk_engine_cache: results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) assertions.assertTrue( times[0] > times[2], - msg=f"test_dynamo_compile_with_default_disk_engine_cache: Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", + msg=f"Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", ) def test_dynamo_compile_with_custom_engine_cache(self): @@ -167,18 +169,18 @@ def test_dynamo_compile_with_custom_engine_cache(self): cos_sim = cosine_similarity(results[0], results[1]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_dynamo_compile_with_custom_engine_cache: results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) cos_sim = cosine_similarity(results[1], results[2]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_dynamo_compile_with_custom_engine_cache: results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) assertions.assertTrue( times[0] > times[2], - msg=f"test_dynamo_compile_with_custom_engine_cache: Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", + msg=f"Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", ) def test_torch_compile_with_default_disk_engine_cache(self): @@ -231,18 +233,18 @@ def test_torch_compile_with_default_disk_engine_cache(self): cos_sim = cosine_similarity(results[0], results[1]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_torch_compile_with_default_disk_engine_cache: results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) cos_sim = cosine_similarity(results[1], results[2]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_torch_compile_with_default_disk_engine_cache: results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"results[1] doesn't match with results[2]. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) assertions.assertTrue( times[0] > times[2], - msg=f"test_torch_compile_with_default_disk_engine_cache: Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", + msg=f"Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", ) def test_torch_compile_with_custom_engine_cache(self): @@ -295,16 +297,16 @@ def test_torch_compile_with_custom_engine_cache(self): cos_sim = cosine_similarity(results[0], results[1]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_torch_compile_with_custom_engine_cache: results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"results[0] doesn't match with results[1]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) cos_sim = cosine_similarity(results[1], results[2]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"test_torch_compile_with_custom_engine_cache: results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"results[1] doesn't match with results[2]. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) assertions.assertTrue( times[0] > times[2], - msg=f"test_torch_compile_with_custom_engine_cache: Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", + msg=f"Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", ) diff --git a/tests/py/dynamo/models/test_export_kwargs_serde.py b/tests/py/dynamo/models/test_export_kwargs_serde.py index 08b23d55e0..52a927e518 100644 --- a/tests/py/dynamo/models/test_export_kwargs_serde.py +++ b/tests/py/dynamo/models/test_export_kwargs_serde.py @@ -63,6 +63,8 @@ def forward(self, x, b=5, c=None, d=None): "optimization_level": 1, "min_block_size": 1, "ir": "dynamo", + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torch.export.export(model, args=tuple(args), kwargs=kwargs) @@ -122,6 +124,8 @@ def forward(self, x, b=5, c=None, d=None): "optimization_level": 1, "min_block_size": 1, "ir": "dynamo", + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ -190,6 +194,8 @@ def forward(self, x, b=5, c=None, d=None): "optimization_level": 1, "min_block_size": 1, "ir": "dynamo", + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ -271,6 +277,8 @@ def forward(self, x, b=None, c=None, d=None, e=[]): "optimization_level": 1, "min_block_size": 1, "ir": "dynamo", + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ -358,6 +366,8 @@ def forward(self, x, b=None, c=None, d=None, e=[]): "optimization_level": 1, "min_block_size": 1, "ir": "dynamo", + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ -444,6 +454,8 @@ def forward(self, x, b=None, c=None, d=None, e=[]): "optimization_level": 1, "min_block_size": 1, "ir": "dynamo", + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ 
-505,6 +517,8 @@ def forward(self, x, b=5, c=None, d=None): "optimization_level": 1, "min_block_size": 1, "ir": "dynamo", + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torch.export.export(model, args=tuple(args), kwargs=kwargs) diff --git a/tests/py/dynamo/models/test_export_serde.py b/tests/py/dynamo/models/test_export_serde.py index c0c0ba0f22..146cc2addf 100644 --- a/tests/py/dynamo/models/test_export_serde.py +++ b/tests/py/dynamo/models/test_export_serde.py @@ -42,6 +42,8 @@ def forward(self, x): ], "ir": ir, "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ -94,6 +96,8 @@ def forward(self, x): ], "ir": ir, "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ -150,6 +154,8 @@ def forward(self, x): ) ], "ir": ir, + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ -209,6 +215,8 @@ def forward(self, x): "ir": ir, "min_block_size": 1, "torch_executed_ops": {"torch.ops.aten.relu.default"}, + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ -250,6 +258,8 @@ def test_resnet18(ir): ], "ir": ir, "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ -293,6 +303,8 @@ def test_resnet18_dynamic(ir): ], "ir": ir, "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ -340,6 +352,8 @@ def forward(self, x): "ir": ir, "min_block_size": 1, "torch_executed_ops": {"torch.ops.aten.convolution.default"}, + "cache_built_engines": False, + "reuse_cached_engines": False, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ -388,7 +402,14 @@ def forward(self, x): model = MyModule().eval().cuda() input = torch.randn((1, 3, 224, 224)).to("cuda") - trt_gm = torchtrt.compile(model, ir=ir, inputs=[input], min_block_size=1) + trt_gm = torchtrt.compile( + model, + ir=ir, + inputs=[input], + min_block_size=1, + cache_built_engines=False, + reuse_cached_engines=False, + ) assertions.assertTrue( isinstance(trt_gm, torch.fx.GraphModule), msg=f"test_save_load_ts output type does not match with torch.fx.GraphModule", diff --git a/tests/py/dynamo/models/test_models.py b/tests/py/dynamo/models/test_models.py index 2d45af2b49..ba6cb0c776 100644 --- a/tests/py/dynamo/models/test_models.py +++ b/tests/py/dynamo/models/test_models.py @@ -30,6 +30,8 @@ def test_resnet18(ir): "pass_through_build_failures": True, "optimization_level": 1, "ir": "torch_compile", + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -61,6 +63,8 @@ def test_mobilenet_v2(ir): "optimization_level": 1, "min_block_size": 10, "ir": "torch_compile", + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -92,6 +96,8 @@ def test_efficientnet_b0(ir): "optimization_level": 1, "min_block_size": 10, "ir": "torch_compile", + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -132,6 +138,8 @@ def test_bert_base_uncased(ir): "optimization_level": 1, "min_block_size": 15, "ir": "torch_compile", 
+ "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -166,6 +174,8 @@ def test_resnet18_half(ir): "pass_through_build_failures": True, "optimization_level": 1, "ir": "torch_compile", + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) diff --git a/tests/py/dynamo/models/test_models_export.py b/tests/py/dynamo/models/test_models_export.py index df71d6b58a..bf19c3c5e6 100644 --- a/tests/py/dynamo/models/test_models_export.py +++ b/tests/py/dynamo/models/test_models_export.py @@ -31,6 +31,8 @@ def test_resnet18(ir): "pass_through_build_failures": True, "optimization_level": 1, "min_block_size": 8, + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -61,6 +63,8 @@ def test_mobilenet_v2(ir): "pass_through_build_failures": True, "optimization_level": 1, "min_block_size": 8, + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -91,6 +95,8 @@ def test_efficientnet_b0(ir): "pass_through_build_failures": True, "optimization_level": 1, "min_block_size": 8, + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -130,6 +136,8 @@ def test_bert_base_uncased(ir): "truncate_double": True, "ir": ir, "min_block_size": 10, + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) model_outputs = model(input, input2) @@ -168,6 +176,8 @@ def test_resnet18_half(ir): "pass_through_build_failures": True, "optimization_level": 1, "min_block_size": 8, + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -223,6 +233,8 @@ def calibrate_loop(model): enabled_precisions={torch.float8_e4m3fn}, min_block_size=1, debug=True, + cache_built_engines=False, + reuse_cached_engines=False, ) outputs_trt = trt_model(input_tensor) assert torch.allclose(output_pyt, outputs_trt, rtol=1e-3, atol=1e-2) @@ -272,6 +284,8 @@ def calibrate_loop(model): enabled_precisions={torch.int8}, min_block_size=1, debug=True, + cache_built_engines=False, + reuse_cached_engines=False, ) outputs_trt = trt_model(input_tensor) assert torch.allclose(output_pyt, outputs_trt, rtol=1e-3, atol=1e-2) From 42d18ac1402e87b19d98ba1d85e96d8bcb79cb2a Mon Sep 17 00:00:00 2001 From: Evan Li Date: Wed, 28 Aug 2024 22:23:01 -0700 Subject: [PATCH 13/14] fix CI errors --- tests/py/dynamo/models/test_engine_cache.py | 7 +++++++ tests/py/dynamo/runtime/test_001_streams.py | 2 ++ tests/py/dynamo/runtime/test_002_lazy_engine_init.py | 10 ++++++++++ 3 files changed, 19 insertions(+) diff --git a/tests/py/dynamo/models/test_engine_cache.py b/tests/py/dynamo/models/test_engine_cache.py index 24bb96c4f2..46916f7c62 100644 --- a/tests/py/dynamo/models/test_engine_cache.py +++ b/tests/py/dynamo/models/test_engine_cache.py @@ -4,6 +4,7 @@ import unittest from typing import Optional +import pytest import torch import torch_tensorrt as torch_trt import torchvision.models as models @@ -183,6 +184,9 @@ def test_dynamo_compile_with_custom_engine_cache(self): msg=f"Engine caching didn't speed up the compilation. 
Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", ) + @pytest.mark.skip( + reason="The test needs a fix for refit, which is reported in https://github.com/pytorch/TensorRT/issues/3126" + ) def test_torch_compile_with_default_disk_engine_cache(self): # Custom Engine Cache model = models.resnet18(pretrained=True).eval().to("cuda") @@ -247,6 +251,9 @@ def test_torch_compile_with_default_disk_engine_cache(self): msg=f"Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", ) + @pytest.mark.skip( + reason="The test needs a fix for refit, which is reported in https://github.com/pytorch/TensorRT/issues/3126" + ) def test_torch_compile_with_custom_engine_cache(self): # Custom Engine Cache model = models.resnet18(pretrained=True).eval().to("cuda") diff --git a/tests/py/dynamo/runtime/test_001_streams.py b/tests/py/dynamo/runtime/test_001_streams.py index 574db6611e..aaec9e3d41 100644 --- a/tests/py/dynamo/runtime/test_001_streams.py +++ b/tests/py/dynamo/runtime/test_001_streams.py @@ -31,6 +31,8 @@ def forward(self, x): enabled_precisions={dtype}, min_block_size=1, device=device, + cache_built_engines=False, + reuse_cached_engines=False, ) for i in range(100): diff --git a/tests/py/dynamo/runtime/test_002_lazy_engine_init.py b/tests/py/dynamo/runtime/test_002_lazy_engine_init.py index 1f3de69eb3..008b0f53b1 100644 --- a/tests/py/dynamo/runtime/test_002_lazy_engine_init.py +++ b/tests/py/dynamo/runtime/test_002_lazy_engine_init.py @@ -160,6 +160,8 @@ def test_lazy_engine_init_py_e2e(self): "ir": "dynamo", "lazy_engine_init": True, "use_python_runtime": True, + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -194,6 +196,8 @@ def test_lazy_engine_init_cpp_e2e(self): "ir": "dynamo", "lazy_engine_init": True, "use_python_runtime": False, + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -228,6 +232,8 @@ def test_lazy_engine_init_cpp_serialization(self): "ir": "dynamo", "lazy_engine_init": True, "use_python_runtime": False, + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.compile(model, **compile_spec) @@ -276,6 +282,8 @@ def forward(self, a, b): "lazy_engine_init": True, "use_python_runtime": True, "torch_executed_ops": [torch.ops.aten.sub.Tensor], + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.dynamo.compile(exp_program, **compile_spec) @@ -318,6 +326,8 @@ def forward(self, a, b): "lazy_engine_init": True, "use_python_runtime": False, "torch_executed_ops": [torch.ops.aten.sub.Tensor], + "cache_built_engines": False, + "reuse_cached_engines": False, } trt_mod = torchtrt.dynamo.compile(exp_program, **compile_spec) From fc525e6068be472f58794e355d5fdb31f22b9c48 Mon Sep 17 00:00:00 2001 From: Evan Li Date: Thu, 29 Aug 2024 11:45:20 -0700 Subject: [PATCH 14/14] fix refit issue for torch.compile --- py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py | 5 ++--- tests/py/dynamo/models/test_engine_cache.py | 6 ------ 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 22743af0aa..3c97c8347a 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -40,7 +40,7 @@ 
get_node_name, get_trt_tensor, ) -from torch_tensorrt.dynamo.utils import DYNAMIC_DIM, to_torch_device +from torch_tensorrt.dynamo.utils import DYNAMIC_DIM, get_model_device, to_torch_device from torch_tensorrt.fx.observer import Observer from torch_tensorrt.logging import TRT_LOGGER @@ -434,9 +434,8 @@ def _save_weight_mapping(self) -> None: """ _LOGGER.info("Building weight name mapping...") # Stage 1: Name mapping - sd = self.module.state_dict() torch_device = to_torch_device(self.compilation_settings.device) - gm_is_on_cuda = list(sd.values())[0].device.type == "cuda" + gm_is_on_cuda = get_model_device(self.module).type == "cuda" if not gm_is_on_cuda: # If the model original position is on CPU, move it GPU sd = { diff --git a/tests/py/dynamo/models/test_engine_cache.py b/tests/py/dynamo/models/test_engine_cache.py index 46916f7c62..189a492d4e 100644 --- a/tests/py/dynamo/models/test_engine_cache.py +++ b/tests/py/dynamo/models/test_engine_cache.py @@ -184,9 +184,6 @@ def test_dynamo_compile_with_custom_engine_cache(self): msg=f"Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", ) - @pytest.mark.skip( - reason="The test needs a fix for refit, which is reported in https://github.com/pytorch/TensorRT/issues/3126" - ) def test_torch_compile_with_default_disk_engine_cache(self): # Custom Engine Cache model = models.resnet18(pretrained=True).eval().to("cuda") @@ -251,9 +248,6 @@ def test_torch_compile_with_default_disk_engine_cache(self): msg=f"Engine caching didn't speed up the compilation. Time taken without engine caching: {times[0]} ms, time taken with engine caching: {times[2]} ms", ) - @pytest.mark.skip( - reason="The test needs a fix for refit, which is reported in https://github.com/pytorch/TensorRT/issues/3126" - ) def test_torch_compile_with_custom_engine_cache(self): # Custom Engine Cache model = models.resnet18(pretrained=True).eval().to("cuda")
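
Usage note: the snippet below is a minimal sketch of how the caching options exercised by the tests in this series could be enabled through the torch.compile path. It assumes a CUDA-capable GPU and a torch_tensorrt build that includes these patches; the model, input shape, and cache directory are illustrative placeholders, and the option names mirror the final test code above.

import torch
import torch_tensorrt  # noqa: F401  -- importing registers the "tensorrt" backend
import torchvision.models as models

model = models.resnet18(pretrained=True).eval().to("cuda")
inputs = [torch.rand((100, 3, 224, 224)).to("cuda")]

compiled_model = torch.compile(
    model,
    backend="tensorrt",
    options={
        "use_python_runtime": True,
        "enabled_precisions": {torch.float},
        "min_block_size": 1,
        "make_refitable": True,  # the tests enable refit alongside caching
        "cache_built_engines": True,  # save newly built engines to the cache
        "reuse_cached_engines": True,  # reload a cached engine when the hash matches
        "engine_cache_dir": "/tmp/torch_trt_engine_cache",  # illustrative path
        "engine_cache_size": 1 << 30,  # 1GB budget, as in the tests
    },
)
compiled_model(*inputs)  # first call triggers compilation and populates the cache

Recompiling the same graph afterwards should be noticeably faster, which is what the times[0] > times[2] assertions in the tests above verify.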