diff --git a/core/conversion/conversionctx/ConversionCtx.cpp b/core/conversion/conversionctx/ConversionCtx.cpp
index 025b4fb1c1..0d7b7084d9 100644
--- a/core/conversion/conversionctx/ConversionCtx.cpp
+++ b/core/conversion/conversionctx/ConversionCtx.cpp
@@ -20,6 +20,7 @@ std::ostream& operator<<(std::ostream& os, const BuilderSettings& s) {
         << "\n    Debuggable Engine: " << s.debug \
         << "\n    GPU ID: " << s.device.gpu_id \
         << "\n    Allow GPU Fallback (if running on DLA): " << s.device.allow_gpu_fallback \
+        << "\n    Min Timing Iterations: " << s.num_min_timing_iters \
         << "\n    Avg Timing Iterations: " << s.num_avg_timing_iters \
         << "\n    Max Workspace Size: " << s.workspace_size;
@@ -103,6 +104,7 @@ ConversionCtx::ConversionCtx(BuilderSettings build_settings)
     cfg->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
   }
 
+  cfg->setMinTimingIterations(settings.num_min_timing_iters);
   cfg->setAvgTimingIterations(settings.num_avg_timing_iters);
   cfg->setMaxWorkspaceSize(settings.workspace_size);
   cfg->setDefaultDeviceType(settings.device.device_type);
diff --git a/cpp/bin/torchtrtc/main.cpp b/cpp/bin/torchtrtc/main.cpp
index 0375e67347..f43642584e 100644
--- a/cpp/bin/torchtrtc/main.cpp
+++ b/cpp/bin/torchtrtc/main.cpp
@@ -113,6 +113,8 @@ int main(int argc, char** argv) {
       "Whether to treat input file as a serialized TensorRT engine and embed it into a TorchScript module (device spec must be provided)",
       {"embed-engine"});
 
+  args::ValueFlag<uint64_t> num_min_timing_iters(
+      parser, "num_iters", "Number of minimization timing iterations used to select kernels", {"num-min-timing-iter"});
   args::ValueFlag<uint64_t> num_avg_timing_iters(
       parser, "num_iters", "Number of averaging timing iterations used to select kernels", {"num-avg-timing-iters"});
   args::ValueFlag<uint64_t> workspace_size(
diff --git a/cpp/src/compile_spec.cpp b/cpp/src/compile_spec.cpp
index e44d283334..3058b23ce0 100644
--- a/cpp/src/compile_spec.cpp
+++ b/cpp/src/compile_spec.cpp
@@ -81,6 +81,7 @@ torchtrt::core::CompileSpec to_internal_compile_spec(CompileSpec external) {
   internal.convert_info.engine_settings.device.gpu_id = external.device.gpu_id;
   internal.convert_info.engine_settings.device.dla_core = external.device.dla_core;
+  internal.convert_info.engine_settings.num_min_timing_iters = external.num_min_timing_iters;
   internal.convert_info.engine_settings.num_avg_timing_iters = external.num_avg_timing_iters;
   internal.convert_info.engine_settings.workspace_size = external.workspace_size;
diff --git a/docsrc/tutorials/use_from_pytorch.rst b/docsrc/tutorials/use_from_pytorch.rst
index 25348b2ac8..0c616e9414 100644
--- a/docsrc/tutorials/use_from_pytorch.rst
+++ b/docsrc/tutorials/use_from_pytorch.rst
@@ -45,6 +45,7 @@ at the documentation for the Torch-TensorRT ``TensorRTCompileSpec`` API.
                     "allow_gpu_fallback": True
                 },
                 "capability": torch_tensorrt.EngineCapability.default,
+                "num_min_timing_iters": 2,
                 "num_avg_timing_iters": 1,
             })
     }
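
Taken together, the hunks above thread the new num_min_timing_iters setting from the public C++ spec and the torchtrtc CLI (via --num-min-timing-iter) down to TensorRT's IBuilderConfig::setMinTimingIterations. A minimal sketch of the documented to_backend usage with the new key follows; the ResNet-18 model, input shape, and precision choice are illustrative, not part of this change:

    import torch
    import torch_tensorrt
    import torchvision.models as models

    # Any scriptable module works; ResNet-18 is just a concrete stand-in.
    script_model = torch.jit.script(models.resnet18(pretrained=True).eval().cuda())

    spec = {
        "forward":
            torch_tensorrt.ts.TensorRTCompileSpec({
                "inputs": [torch_tensorrt.Input([1, 3, 224, 224])],
                "enabled_precisions": {torch.float},
                "num_min_timing_iters": 2,  # new key introduced by this diff
                "num_avg_timing_iters": 1,
            })
    }
    trt_model = torch._C._jit_to_backend("tensorrt", script_model, spec)
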
"allow_gpu_fallback": True }, "capability": torch_tensorrt.EngineCapability.default, + "num_min_timing_iters": 2, "num_avg_timing_iters": 1, }) } diff --git a/py/torch_tensorrt/csrc/tensorrt_classes.cpp b/py/torch_tensorrt/csrc/tensorrt_classes.cpp index 91f482e7e9..a89fe692bd 100644 --- a/py/torch_tensorrt/csrc/tensorrt_classes.cpp +++ b/py/torch_tensorrt/csrc/tensorrt_classes.cpp @@ -221,6 +221,8 @@ core::CompileSpec CompileSpec::toInternalCompileSpec() { info.convert_info.engine_settings.truncate_long_and_double = truncate_long_and_double; info.convert_info.engine_settings.capability = toTRTEngineCapability(capability); + TORCHTRT_CHECK(num_min_timing_iters >= 0, "num_min_timing_iters must be 0 or greater"); + info.convert_info.engine_settings.num_min_timing_iters = num_min_timing_iters; TORCHTRT_CHECK(num_avg_timing_iters >= 0, "num_avg_timing_iters must be 0 or greater"); info.convert_info.engine_settings.num_avg_timing_iters = num_avg_timing_iters; TORCHTRT_CHECK(workspace_size >= 0, "workspace_size must be 0 or greater"); @@ -247,6 +249,7 @@ std::string CompileSpec::stringify() { ss << " \"Debug\": " << debug << std::endl; ss << " \"Device\": " << device.to_str() << std::endl; ss << " \"Engine Capability\": " << to_str(capability) << std::endl; + ss << " \"Num Min Timing Iters\": " << num_min_timing_iters << std::endl; ss << " \"Num Avg Timing Iters\": " << num_avg_timing_iters << std::endl; ss << " \"Workspace Size\": " << workspace_size << std::endl; ss << " \"Truncate long and double\": " << truncate_long_and_double << std::endl; diff --git a/py/torch_tensorrt/csrc/tensorrt_classes.h b/py/torch_tensorrt/csrc/tensorrt_classes.h index 04a6e01143..0c80641005 100644 --- a/py/torch_tensorrt/csrc/tensorrt_classes.h +++ b/py/torch_tensorrt/csrc/tensorrt_classes.h @@ -147,6 +147,7 @@ struct CompileSpec : torch::CustomClassHolder { ADD_FIELD_GET_SET(refit, bool); ADD_FIELD_GET_SET(debug, bool); ADD_ENUM_GET_SET(capability, EngineCapability, static_cast(EngineCapability::kSAFE_DLA)); + ADD_FIELD_GET_SET(num_min_timing_iters, int64_t); ADD_FIELD_GET_SET(num_avg_timing_iters, int64_t); ADD_FIELD_GET_SET(workspace_size, int64_t); ADD_FIELD_GET_SET(truncate_long_and_double, bool); @@ -165,6 +166,7 @@ struct CompileSpec : torch::CustomClassHolder { Device device; TorchFallback torch_fallback; EngineCapability capability = EngineCapability::kDEFAULT; + int64_t num_min_timing_iters = 2; int64_t num_avg_timing_iters = 1; int64_t workspace_size = 0; }; diff --git a/py/torch_tensorrt/ts/_compile_spec.py b/py/torch_tensorrt/ts/_compile_spec.py index b462470cef..e406096677 100644 --- a/py/torch_tensorrt/ts/_compile_spec.py +++ b/py/torch_tensorrt/ts/_compile_spec.py @@ -203,6 +203,10 @@ def _parse_compile_spec(compile_spec: Dict[str, Any]) -> _ts_C.CompileSpec: assert isinstance(compile_spec["capability"], _enums.EngineCapability) info.capability = compile_spec["capability"] + if "num_min_timing_iters" in compile_spec: + assert type(compile_spec["num_min_timing_iters"]) is int + info.num_min_timing_iters = compile_spec["num_min_timing_iters"] + if "num_avg_timing_iters" in compile_spec: assert type(compile_spec["num_avg_timing_iters"]) is int info.num_avg_timing_iters = compile_spec["num_avg_timing_iters"] diff --git a/py/torch_tensorrt/ts/_compiler.py b/py/torch_tensorrt/ts/_compiler.py index f4720287d6..c0e88b99ce 100644 --- a/py/torch_tensorrt/ts/_compiler.py +++ b/py/torch_tensorrt/ts/_compiler.py @@ -18,6 +18,7 @@ def compile(module: torch.jit.ScriptModule, refit=False, debug=False, 
diff --git a/tests/py/test_to_backend_api.py b/tests/py/test_to_backend_api.py
index adde021ab8..11c411ff56 100644
--- a/tests/py/test_to_backend_api.py
+++ b/tests/py/test_to_backend_api.py
@@ -26,6 +26,7 @@ def setUp(self):
                     "allow_gpu_fallback": True
                 },
                 "capability": torchtrt.EngineCapability.default,
+                "num_min_timing_iters": 2,
                 "num_avg_timing_iters": 1,
                 "disable_tf32": False,
             })
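
Beyond the spec update in the to_backend test above, a hypothetical companion test (not part of this diff; the model, shapes, and tolerance are invented for illustration) could exercise the new keyword through the compile API directly:

    import unittest
    import torch
    import torch_tensorrt as torchtrt

    class TestNumMinTimingIters(unittest.TestCase):
        def test_compile_accepts_min_timing_iters(self):
            # A trivial module keeps the TensorRT build fast; the choice is arbitrary.
            mod = torch.jit.script(torch.nn.ReLU().eval().cuda())
            trt_mod = torchtrt.ts.compile(
                mod,
                inputs=[torchtrt.Input([1, 3, 8, 8])],
                num_min_timing_iters=2,
            )
            x = torch.randn(1, 3, 8, 8, device="cuda")
            self.assertTrue(torch.allclose(trt_mod(x), mod(x), atol=1e-5))

    if __name__ == "__main__":
        unittest.main()
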