From f4c29b43abd2703430672028ae35c5d335e442b1 Mon Sep 17 00:00:00 2001
From: Bo Wang
Date: Tue, 9 Mar 2021 22:49:00 -0600
Subject: [PATCH] feat: added user level API for fallback

Signed-off-by: Bo Wang
---
 core/compiler.cpp                              | 54 ++++++++++---------
 .../conversionctx/ConversionCtx.cpp            |  5 ++
 core/conversion/conversionctx/ConversionCtx.h  |  7 +++
 core/partitioning/partitioning.cpp             |  4 --
 cpp/api/include/trtorch/trtorch.h              | 31 +++++++++++
 cpp/api/src/compile_spec.cpp                   |  3 ++
 6 files changed, 74 insertions(+), 30 deletions(-)

diff --git a/core/compiler.cpp b/core/compiler.cpp
index 9d87a5cc31..91cb0f8281 100644
--- a/core/compiler.cpp
+++ b/core/compiler.cpp
@@ -156,29 +156,6 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::
   return std::move(engine);
 }

-//torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, CompileSpec cfg) {
-//  // TODO: Should be doing a functional transform but need PR #31978
-//  // [jit] More robust mangling
-//  // torch::jit::script::Module new_mod = mod.clone();
-//  torch::jit::script::Module new_mod(mod._ivalue()->name() + "_trt");
-//  std::vector<std::shared_ptr<torch::jit::Graph>> graphs;
-//  for (const torch::jit::script::Method& method : mod.get_methods()) {
-//    // Don't convert hidden methods
-//    if (method.name().rfind("_", 0)) {
-//      auto engine = ConvertGraphToTRTEngine(mod, method.name(), cfg);
-//      auto new_g = std::make_shared<torch::jit::Graph>();
-//      AddEngineToGraph(new_mod, new_g, engine);
-//      auto new_method = new_mod._ivalue()->compilation_unit()->create_function(method.name(), new_g);
-//      auto schema = GenerateGraphSchema(new_mod, new_method->name(), new_g);
-//      new_mod.type()->addMethod(new_method);
-//      new_method->setSchema(schema);
-//    }
-//  }
-//
-//  return new_mod;
-//}
-
-
 void AddSegmentedBlockToGraph(std::shared_ptr<torch::jit::Graph>& g, partitioning::SegmentedBlock &seg,
                               std::unordered_map<torch::jit::Value*, torch::jit::Value*> &old_to_new_g) {
@@ -198,7 +175,6 @@ void AddSegmentedBlockToGraph(std::shared_ptr<torch::jit::Graph>& g, partitionin
     }
   }

-  torch::jit::Node *node;
   for (const auto n : seg.nodes()) {
     partitioning::cloneNode(n, g, old_to_new_g);
   }
@@ -212,8 +188,7 @@ void AddSegmentedBlockToGraph(std::shared_ptr<torch::jit::Graph>& g, partitionin
   return;
 }

-
-torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, CompileSpec cfg) {
+torch::jit::script::Module CompileGraphWithFallback(const torch::jit::script::Module& mod, CompileSpec cfg) {
   // TODO: Should be doing a functional transform but need PR #31978
   // [jit] More robust mangling
   // torch::jit::script::Module new_mod = mod.clone();
@@ -270,6 +245,33 @@ torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, C
   return new_mod;
 }

+
+torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, CompileSpec cfg) {
+  // TODO: not sure how to deal with duplicated code here, so just cut out a branch temporally
+  if (cfg.convert_info.engine_settings.torch_fallback.enabled) {
+    return CompileGraphWithFallback(mod, cfg);
+  }
+  // TODO: Should be doing a functional transform but need PR #31978
+  // [jit] More robust mangling
+  // torch::jit::script::Module new_mod = mod.clone();
+  torch::jit::script::Module new_mod(mod._ivalue()->name() + "_trt");
+  std::vector<std::shared_ptr<torch::jit::Graph>> graphs;
+  for (const torch::jit::script::Method& method : mod.get_methods()) {
+    // Don't convert hidden methods
+    if (method.name().rfind("_", 0)) {
+      auto engine = ConvertGraphToTRTEngine(mod, method.name(), cfg);
+      auto new_g = std::make_shared<torch::jit::Graph>();
+      AddEngineToGraph(new_mod, new_g, engine);
+      auto new_method = new_mod._ivalue()->compilation_unit()->create_function(method.name(), new_g);
+      auto schema = GenerateGraphSchema(new_mod, new_method->name(), new_g);
+      new_mod.type()->addMethod(new_method);
+      new_method->setSchema(schema);
+    }
+  }
+
+  return new_mod;
+}
+
 void set_device(const int gpu_id) {
   TRTORCH_ASSERT(cudaSetDevice(gpu_id) == cudaSuccess, "Unable to set CUDA device: " << gpu_id);
 }
diff --git a/core/conversion/conversionctx/ConversionCtx.cpp b/core/conversion/conversionctx/ConversionCtx.cpp
index 04f6aafe5c..038ea3874c 100644
--- a/core/conversion/conversionctx/ConversionCtx.cpp
+++ b/core/conversion/conversionctx/ConversionCtx.cpp
@@ -36,6 +36,11 @@ std::ostream& operator<<(std::ostream& os, const BuilderSettings& s) {
   }
   os << "\n Engine Capability: " << s.capability \
      << "\n Calibrator Created: " << (s.calibrator != nullptr);
+
+  os << "\n Torch Fallback: " << s.torch_fallback.enabled;
+  if (s.torch_fallback.enabled) {
+    os << "\n Fallback min block size: " << s.torch_fallback.min_block_size;
+  }
   return os;
 }
 // clang-format on
diff --git a/core/conversion/conversionctx/ConversionCtx.h b/core/conversion/conversionctx/ConversionCtx.h
index 444936cb8b..5588539c12 100644
--- a/core/conversion/conversionctx/ConversionCtx.h
+++ b/core/conversion/conversionctx/ConversionCtx.h
@@ -22,6 +22,12 @@ struct Device {
   Device() : device_type(nvinfer1::DeviceType::kGPU), gpu_id(0), dla_core(0), allow_gpu_fallback(false) {}
 };

+struct TorchFallback {
+  bool enabled = false;
+  uint64_t min_block_size = 1;
+  std::vector<std::string> forced_fallback_operators;
+};
+
 struct BuilderSettings {
   nvinfer1::DataType op_precision = nvinfer1::DataType::kFLOAT;
   bool disable_tf32 = false;
@@ -29,6 +35,7 @@ struct BuilderSettings {
   bool debug = false;
   bool strict_types = false;
   Device device;
+  TorchFallback torch_fallback;
   nvinfer1::EngineCapability capability = nvinfer1::EngineCapability::kDEFAULT;
   nvinfer1::IInt8Calibrator* calibrator = nullptr;
   uint64_t num_min_timing_iters = 2;
diff --git a/core/partitioning/partitioning.cpp b/core/partitioning/partitioning.cpp
index 17198b6faf..f9248d436a 100644
--- a/core/partitioning/partitioning.cpp
+++ b/core/partitioning/partitioning.cpp
@@ -124,10 +124,6 @@ void registerSegmentsInputsOutputs(std::vector<SegmentedBlock> &segmented_blocks
     }
   }

-//  for (auto &graph_input : g->inputs()) {
-//    input_values.erase(graph_input);
-//  }
-
   for (auto &graph_output : g->outputs()) {
     input_values.insert(graph_output);
   }
diff --git a/cpp/api/include/trtorch/trtorch.h b/cpp/api/include/trtorch/trtorch.h
index 4739d9199a..ca956d37d7 100644
--- a/cpp/api/include/trtorch/trtorch.h
+++ b/cpp/api/include/trtorch/trtorch.h
@@ -381,6 +381,37 @@ struct TRTORCH_API CompileSpec {
    */
   Device device;

+  /**
+   * @brief A struct to hold fallback info
+   */
+  struct TRTORCH_API TorchFallback {
+    /// enable the automatic fallback feature
+    bool enabled = false;
+
+    /// minimum consecutive operation number that needs to be satisfied to convert to TensorRT
+    uint64_t min_block_size = 1;
+
+    /// A list of names of operations that will explicitly run in PyTorch
+    std::vector<std::string> forced_fallback_operators;
+
+    /**
+     * @brief Construct a default Torch Fallback object, fallback will be off
+     */
+    TorchFallback() = default;
+
+    /**
+     * @brief Construct from a bool
+     */
+    TorchFallback(bool enabled) : enabled(enabled) {}
+
+    /**
+     * @brief Constructor for setting min_block_size
+     */
+    TorchFallback(bool enabled, uint64_t min_size) : enabled(enabled), min_block_size(min_size) {}
+  };
+
+  TorchFallback torch_fallback;
+
   /**
    * Sets the restrictions for the engine (CUDA Safety)
    */
diff --git a/cpp/api/src/compile_spec.cpp b/cpp/api/src/compile_spec.cpp
index 25dbda9c96..c1690a44aa 100644
--- a/cpp/api/src/compile_spec.cpp
+++ b/cpp/api/src/compile_spec.cpp
@@ -95,6 +95,9 @@ core::CompileSpec to_internal_compile_spec(CompileSpec external) {
   internal.convert_info.engine_settings.strict_types = external.strict_types;
   internal.convert_info.engine_settings.device.allow_gpu_fallback = external.device.allow_gpu_fallback;
   internal.convert_info.engine_settings.max_batch_size = external.max_batch_size;
+  internal.convert_info.engine_settings.torch_fallback.enabled = external.torch_fallback.enabled;
+  internal.convert_info.engine_settings.torch_fallback.min_block_size = external.torch_fallback.min_block_size;
+  internal.convert_info.engine_settings.torch_fallback.forced_fallback_operators = external.torch_fallback.forced_fallback_operators;

   switch (external.device.device_type) {
     case CompileSpec::Device::DeviceType::kDLA:
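
For illustration, below is a minimal sketch of how the user-level fallback knobs added to cpp/api/include/trtorch/trtorch.h could be exercised from client code. It assumes the existing public entry points trtorch::CompileSpec (constructed from fixed input shapes) and trtorch::CompileGraph; the model path, input shape, and the forced-fallback operator name are placeholders, not part of this patch.

#include <torch/script.h>
#include "trtorch/trtorch.h"

int main() {
  // Load a TorchScript module to compile (path is a placeholder).
  torch::jit::script::Module mod = torch::jit::load("model.ts");

  // Fixed input shape for the engine (illustrative only).
  trtorch::CompileSpec spec({{1, 3, 224, 224}});

  // Turn on automatic fallback: blocks of fewer than 3 consecutive
  // TensorRT-convertible ops are left to run in PyTorch.
  spec.torch_fallback = trtorch::CompileSpec::TorchFallback(true, 3);

  // Explicitly keep a particular op in PyTorch (operator name is illustrative).
  spec.torch_fallback.forced_fallback_operators.push_back("aten::max_pool2d");

  // Compile: supported segments become TensorRT engines, the rest stays in LibTorch.
  auto trt_mod = trtorch::CompileGraph(mod, spec);
  return 0;
}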