From f4c29b43abd2703430672028ae35c5d335e442b1 Mon Sep 17 00:00:00 2001
From: Bo Wang
Date: Tue, 9 Mar 2021 22:49:00 -0600
Subject: [PATCH] feat: added user level API for fallback

Signed-off-by: Bo Wang
---
 core/compiler.cpp                              | 54 ++++++++++---------
 .../conversionctx/ConversionCtx.cpp            |  5 ++
 core/conversion/conversionctx/ConversionCtx.h  |  7 +++
 core/partitioning/partitioning.cpp             |  4 --
 cpp/api/include/trtorch/trtorch.h              | 31 +++++++++++
 cpp/api/src/compile_spec.cpp                   |  3 ++
 6 files changed, 74 insertions(+), 30 deletions(-)

diff --git a/core/compiler.cpp b/core/compiler.cpp
index 9d87a5cc31..91cb0f8281 100644
--- a/core/compiler.cpp
+++ b/core/compiler.cpp
@@ -156,29 +156,6 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::
   return std::move(engine);
 }

-//torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, CompileSpec cfg) {
-//  // TODO: Should be doing a functional transform but need PR #31978
-//  // [jit] More robust mangling
-//  // torch::jit::script::Module new_mod = mod.clone();
-//  torch::jit::script::Module new_mod(mod._ivalue()->name() + "_trt");
-//  std::vector<std::shared_ptr<torch::jit::Graph>> graphs;
-//  for (const torch::jit::script::Method& method : mod.get_methods()) {
-//    // Don't convert hidden methods
-//    if (method.name().rfind("_", 0)) {
-//      auto engine = ConvertGraphToTRTEngine(mod, method.name(), cfg);
-//      auto new_g = std::make_shared<torch::jit::Graph>();
-//      AddEngineToGraph(new_mod, new_g, engine);
-//      auto new_method = new_mod._ivalue()->compilation_unit()->create_function(method.name(), new_g);
-//      auto schema = GenerateGraphSchema(new_mod, new_method->name(), new_g);
-//      new_mod.type()->addMethod(new_method);
-//      new_method->setSchema(schema);
-//    }
-//  }
-//
-//  return new_mod;
-//}
-
-
 void AddSegmentedBlockToGraph(std::shared_ptr<torch::jit::Graph>& g, partitioning::SegmentedBlock &seg,
                               std::unordered_map<torch::jit::Value*, torch::jit::Value*> &old_to_new_g) {
@@ -198,7 +175,6 @@ void AddSegmentedBlockToGraph(std::shared_ptr<torch::jit::Graph>& g, partitionin
     }
   }

-  torch::jit::Node *node;
   for (const auto n : seg.nodes()) {
     partitioning::cloneNode(n, g, old_to_new_g);
   }
@@ -212,8 +188,7 @@ void AddSegmentedBlockToGraph(std::shared_ptr<torch::jit::Graph>& g, partitionin
   return;
 }

-
-torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, CompileSpec cfg) {
+torch::jit::script::Module CompileGraphWithFallback(const torch::jit::script::Module& mod, CompileSpec cfg) {
   // TODO: Should be doing a functional transform but need PR #31978
   // [jit] More robust mangling
   // torch::jit::script::Module new_mod = mod.clone();
@@ -270,6 +245,33 @@ torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, C
   return new_mod;
 }

+
+torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, CompileSpec cfg) {
+  // TODO: not sure how to deal with duplicated code here, so just cut out a branch temporally
+  if (cfg.convert_info.engine_settings.torch_fallback.enabled) {
+    return CompileGraphWithFallback(mod, cfg);
+  }
+  // TODO: Should be doing a functional transform but need PR #31978
+  // [jit] More robust mangling
+  // torch::jit::script::Module new_mod = mod.clone();
+  torch::jit::script::Module new_mod(mod._ivalue()->name() + "_trt");
+  std::vector<std::shared_ptr<torch::jit::Graph>> graphs;
+  for (const torch::jit::script::Method& method : mod.get_methods()) {
+    // Don't convert hidden methods
+    if (method.name().rfind("_", 0)) {
+      auto engine = ConvertGraphToTRTEngine(mod, method.name(), cfg);
+      auto new_g = std::make_shared<torch::jit::Graph>();
+      AddEngineToGraph(new_mod, new_g, engine);
+      auto new_method = new_mod._ivalue()->compilation_unit()->create_function(method.name(), new_g);
+      auto schema = GenerateGraphSchema(new_mod, new_method->name(), new_g);
+      new_mod.type()->addMethod(new_method);
+      new_method->setSchema(schema);
+    }
+  }
+
+  return new_mod;
+}
+
 void set_device(const int gpu_id) {
   TRTORCH_ASSERT(cudaSetDevice(gpu_id) == cudaSuccess, "Unable to set CUDA device: " << gpu_id);
 }
diff --git a/core/conversion/conversionctx/ConversionCtx.cpp b/core/conversion/conversionctx/ConversionCtx.cpp
index 04f6aafe5c..038ea3874c 100644
--- a/core/conversion/conversionctx/ConversionCtx.cpp
+++ b/core/conversion/conversionctx/ConversionCtx.cpp
@@ -36,6 +36,11 @@ std::ostream& operator<<(std::ostream& os, const BuilderSettings& s) {
   }
   os << "\n Engine Capability: " << s.capability \
      << "\n Calibrator Created: " << (s.calibrator != nullptr);
+
+  os << "\n Torch Fallback: " << s.torch_fallback.enabled;
+  if (s.torch_fallback.enabled) {
+    os << "\n Fallback min block size: " << s.torch_fallback.min_block_size;
+  }
   return os;
 }
 // clang-format on
diff --git a/core/conversion/conversionctx/ConversionCtx.h b/core/conversion/conversionctx/ConversionCtx.h
index 444936cb8b..5588539c12 100644
--- a/core/conversion/conversionctx/ConversionCtx.h
+++ b/core/conversion/conversionctx/ConversionCtx.h
@@ -22,6 +22,12 @@ struct Device {
   Device() : device_type(nvinfer1::DeviceType::kGPU), gpu_id(0), dla_core(0), allow_gpu_fallback(false) {}
 };

+struct TorchFallback {
+  bool enabled = false;
+  uint64_t min_block_size = 1;
+  std::vector<std::string> forced_fallback_operators;
+};
+
 struct BuilderSettings {
   nvinfer1::DataType op_precision = nvinfer1::DataType::kFLOAT;
   bool disable_tf32 = false;
@@ -29,6 +35,7 @@ struct BuilderSettings {
   bool debug = false;
   bool strict_types = false;
   Device device;
+  TorchFallback torch_fallback;
   nvinfer1::EngineCapability capability = nvinfer1::EngineCapability::kDEFAULT;
   nvinfer1::IInt8Calibrator* calibrator = nullptr;
   uint64_t num_min_timing_iters = 2;
diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp
index 17198b6faf..f9248d436a 100644
--- a/core/partitioning/partitioning.cpp
+++ b/core/partitioning/partitioning.cpp
@@ -124,10 +124,6 @@ void registerSegmentsInputsOutputs(std::vector<SegmentedBlock> &segmented_blocks
     }
   }

-//  for (auto &graph_input : g->inputs()) {
-//    input_values.erase(graph_input);
-//  }
-
   for (auto &graph_output : g->outputs()) {
     input_values.insert(graph_output);
   }
diff --git a/cpp/api/include/trtorch/trtorch.h b/cpp/api/include/trtorch/trtorch.h
index 4739d9199a..ca956d37d7 100644
--- a/cpp/api/include/trtorch/trtorch.h
+++ b/cpp/api/include/trtorch/trtorch.h
@@ -381,6 +381,37 @@ struct TRTORCH_API CompileSpec {
    */
   Device device;

+  /**
+   * @brief A struct to hold fallback info
+   */
+  struct TRTORCH_API TorchFallback {
+    /// enable the automatic fallback feature
+    bool enabled = false;
+
+    /// minimum consecutive operation number that needs to be satisfied to convert to TensorRT
+    uint64_t min_block_size = 1;
+
+    /// A list of names of operations that will explicitly run in PyTorch
+    std::vector<std::string> forced_fallback_operators;
+
+    /**
+     * @brief Construct a default Torch Fallback object, fallback will be off
+     */
+    TorchFallback() = default;
+
+    /**
+     * @brief Construct from a bool
+     */
+    TorchFallback(bool enabled) : enabled(enabled) {}
+
+    /**
+     * @brief Constructor for setting min_block_size
+     */
+    TorchFallback(bool enabled, uint64_t min_size) : enabled(enabled), min_block_size(min_size) {}
+  };
+
+  TorchFallback torch_fallback;
+
   /**
    * Sets the restrictions for the engine (CUDA Safety)
    */
diff --git a/cpp/api/src/compile_spec.cpp b/cpp/api/src/compile_spec.cpp
index 25dbda9c96..c1690a44aa 100644
--- a/cpp/api/src/compile_spec.cpp
+++ b/cpp/api/src/compile_spec.cpp
@@ -95,6 +95,9 @@ core::CompileSpec to_internal_compile_spec(CompileSpec external) {
   internal.convert_info.engine_settings.strict_types = external.strict_types;
   internal.convert_info.engine_settings.device.allow_gpu_fallback = external.device.allow_gpu_fallback;
   internal.convert_info.engine_settings.max_batch_size = external.max_batch_size;
+  internal.convert_info.engine_settings.torch_fallback.enabled = external.torch_fallback.enabled;
+  internal.convert_info.engine_settings.torch_fallback.min_block_size = external.torch_fallback.min_block_size;
+  internal.convert_info.engine_settings.torch_fallback.forced_fallback_operators = external.torch_fallback.forced_fallback_operators;

   switch (external.device.device_type) {
     case CompileSpec::Device::DeviceType::kDLA:
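
For illustration, below is a minimal sketch of how the user-level fallback knobs added to cpp/api/include/trtorch/trtorch.h could be exercised from client code. It assumes the existing public entry points trtorch::CompileSpec (constructed from fixed input shapes) and trtorch::CompileGraph; the model path, input shape, and the forced-fallback operator name are placeholders, not part of this patch.

#include <torch/script.h>
#include "trtorch/trtorch.h"

int main() {
  // Load a TorchScript module to compile (path is a placeholder).
  torch::jit::script::Module mod = torch::jit::load("model.ts");

  // Fixed input shape for the engine (illustrative only).
  trtorch::CompileSpec spec({{1, 3, 224, 224}});

  // Turn on automatic fallback: blocks of fewer than 3 consecutive
  // TensorRT-convertible ops are left to run in PyTorch.
  spec.torch_fallback = trtorch::CompileSpec::TorchFallback(true, 3);

  // Explicitly keep a particular op in PyTorch (operator name is illustrative).
  spec.torch_fallback.forced_fallback_operators.push_back("aten::max_pool2d");

  // Compile: supported segments become TensorRT engines, the rest stays in LibTorch.
  auto trt_mod = trtorch::CompileGraph(mod, spec);
  return 0;
}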