
Commit

feat(//py): Initial compliant implementation of the to_backend API for PyTorch

Users can now use a direct PyTorch integration by just importing the
trtorch package. The only difference between torch._C._jit_to_tensorrt
and trtorch.compile is that you need to wrap your spec dictionary with
the trtorch.TensorRTCompileSpec constructor.

Signed-off-by: Naren Dasan <naren@narendasan.com>
Signed-off-by: Naren Dasan <narens@nvidia.com>
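A minimal usage sketch of the new path (MyModel is a placeholder module, and the exact argument form of torch._C._jit_to_tensorrt is assumed here rather than taken from this commit):

    import torch
    import trtorch

    # Any scriptable nn.Module works; MyModel is hypothetical.
    scripted_mod = torch.jit.script(MyModel().eval().cuda())

    spec = {
        "forward": trtorch.TensorRTCompileSpec({
            "input_shapes": [(1, 3, 224, 224)],  # static shape for the single input
            "op_precision": torch.half,          # build and run the engine in FP16
        })
    }

    # Hand the wrapped, per-method spec to the PyTorch to_backend entry point.
    trt_mod = torch._C._jit_to_tensorrt(scripted_mod._c, spec)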
narendasan committed Oct 21, 2020
1 parent b24c0d8 commit 59113cf
Showing 15 changed files with 573 additions and 133 deletions.
8 changes: 7 additions & 1 deletion py/setup.py
@@ -156,7 +156,12 @@ def run(self):

ext_modules = [
cpp_extension.CUDAExtension('trtorch._C',
['trtorch/csrc/trtorch_py.cpp'],
[
'trtorch/csrc/trtorch_py.cpp',
'trtorch/csrc/tensorrt_backend.cpp',
'trtorch/csrc/tensorrt_classes.cpp',
'trtorch/csrc/register_tensorrt_classes.cpp',
],
library_dirs=[
(dir_path + '/trtorch/lib/'),
"/opt/conda/lib/python3.6/config-3.6m-x86_64-linux-gnu"
@@ -165,6 +170,7 @@ def run(self):
"trtorch"
],
include_dirs=[
dir_path + "trtorch/csrc",
dir_path + "/../",
dir_path + "/../bazel-TRTorch/external/tensorrt/include",
],
1 change: 1 addition & 0 deletions py/trtorch/__init__.py
@@ -9,6 +9,7 @@

from trtorch._version import __version__
from trtorch._compiler import *
from trtorch._compile_spec import TensorRTCompileSpec
from trtorch._types import *
from trtorch import logging

93 changes: 84 additions & 9 deletions py/trtorch/_compile_spec.py
@@ -73,16 +73,21 @@ def _parse_op_precision(precision: Any) -> _types.dtype:

def _parse_device_type(device: Any) -> _types.DeviceType:
if isinstance(device, torch.device):
if torch.device.type == 'cuda':
if device.type == 'cuda':
return _types.DeviceType.gpu
else:
raise TypeError("Valid device choices are GPU (and DLA if on Jetson platforms) however got device type" + str(device.type))

ValueError("Got a device type other than GPU or DLA (type: " + str(device.type) + ")")
elif isinstance(device, _types.DeviceType):
return device

elif isinstance(device, str):
if device == "gpu" or device == "GPU":
return _types.DeviceType.gpu
elif device == "dla" or device == "DLA":
return _types.DeviceType.dla
else:
ValueError("Got a device type other than GPU or DLA (type: " + str(device) + ")")
else:
raise TypeError("Device specification must be of type torch.device or trtorch.DeviceType, but got: " + str(type(device)))
raise TypeError("Device specification must be of type torch.device, string or trtorch.DeviceType, but got: " + str(type(device)))

def _parse_compile_spec(compile_spec: Dict[str, Any]) -> trtorch._C.CompileSpec:
info = trtorch._C.CompileSpec()
@@ -110,11 +115,11 @@ def _parse_compile_spec(compile_spec: Dict[str, Any]) -> trtorch._C.CompileSpec:
assert isinstance(compile_spec["allow_gpu_fallback"], bool)
info.allow_gpu_fallback = compile_spec["allow_gpu_fallback"]

if "device" in compile_spec:
info.device = _parse_device_type(compile_spec["device"])
if "device_type" in compile_spec:
info.device = _parse_device_type(compile_spec["device_type"])

if "capability" in compile_spec:
assert isinstance(compile_spec["capability"], type.EngineCapability)
assert isinstance(compile_spec["capability"], _types.EngineCapability)
info.capability = compile_spec["capability"]

if "num_min_timing_iters" in compile_spec:
@@ -133,4 +138,74 @@ def _parse_compile_spec(compile_spec: Dict[str, Any]) -> trtorch._C.CompileSpec:
assert type(compile_spec["max_batch_size"]) is int
info.max_batch_size = compile_spec["max_batch_size"]

return info
return info

def TensorRTCompileSpec(compile_spec: Dict[str, Any]):
"""
Utility to create a formatted spec dictionary for using the PyTorch TensorRT backend
Args:
compile_spec (dict): Compilation settings including operating precision, target device, etc.
One key is required which is ``input_shapes``, describing the input sizes or ranges for inputs
to the graph. All other keys are optional. Entries for each method to be compiled.
.. code-block:: py
CompileSpec = {
"forward" : trtorch.TensorRTCompileSpec({
"input_shapes": [
(1, 3, 224, 224), # Static input shape for input #1
{
"min": (1, 3, 224, 224),
"opt": (1, 3, 512, 512),
"max": (1, 3, 1024, 1024)
} # Dynamic input shape for input #2
],
"op_precision": torch.half, # Operating precision set to FP16
"refit": false, # enable refit
"debug": false, # enable debuggable engine
"strict_types": false, # kernels should strictly run in operating precision
"allow_gpu_fallback": false, # (DLA only) Allow layers unsupported on DLA to run on GPU
"device": torch.device("cuda"), # Type of device to run engine on (for DLA use trtorch.DeviceType.DLA)
"capability": trtorch.EngineCapability.DEFAULT, # Restrict kernel selection to safe gpu kernels or safe dla kernels
"num_min_timing_iters": 2, # Number of minimization timing iterations used to select kernels
"num_avg_timing_iters": 1, # Number of averaging timing iterations used to select kernels
"workspace_size": 0, # Maximum size of workspace given to TensorRT
"max_batch_size": 0, # Maximum batch size (must be >= 1 to be set, 0 means not set)
})
}
Input Sizes can be specified as torch sizes, tuples or lists. Op precisions can be specified using
torch datatypes or trtorch datatypes and you can use either torch devices or the trtorch device type enum
to select device type.
Returns:
torch.classes.tensorrt.CompileSpec: List of methods and formatted spec objects to be provided to ``torch._C._jit_to_tensorrt``
"""

parsed_spec = _parse_compile_spec(compile_spec)

backend_spec = torch.classes.tensorrt.CompileSpec()

for i in parsed_spec.input_ranges:
ir = torch.classes.tensorrt.InputRange()
ir.set_min(i.min)
ir.set_opt(i.opt)
ir.set_max(i.max)
backend_spec.append_input_range(ir)

backend_spec.set_op_precision(int(parsed_spec.op_precision))
backend_spec.set_refit(parsed_spec.refit)
backend_spec.set_debug(parsed_spec.debug)
backend_spec.set_strict_types(parsed_spec.strict_types)
backend_spec.set_allow_gpu_fallback(parsed_spec.allow_gpu_fallback)
backend_spec.set_device(int(parsed_spec.device))
backend_spec.set_capability(int(parsed_spec.capability))
backend_spec.set_num_min_timing_iters(parsed_spec.num_min_timing_iters)
backend_spec.set_num_avg_timing_iters(parsed_spec.num_avg_timing_iters)
backend_spec.set_workspace_size(parsed_spec.workspace_size)
backend_spec.set_max_batch_size(parsed_spec.max_batch_size)

return backend_spec
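A short sketch of using the wrapper above with a dynamic input range (values are illustrative; the printed form comes from the stringify binding added in register_tensorrt_classes.cpp):

    import torch
    import trtorch

    backend_spec = trtorch.TensorRTCompileSpec({
        "input_shapes": [{
            "min": (1, 3, 224, 224),
            "opt": (1, 3, 512, 512),
            "max": (1, 3, 1024, 1024),
        }],                          # dynamic range for a single input
        "op_precision": torch.half,
    })

    # CompileSpec binds __str__ to stringify(), so the parsed settings can be printed.
    print(backend_spec)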

4 changes: 2 additions & 2 deletions py/trtorch/_compiler.py
@@ -39,7 +39,7 @@ def compile(module: torch.jit.ScriptModule, compile_spec: Any) -> torch.jit.Scri
"debug": false, # enable debuggable engine
"strict_types": false, # kernels should strictly run in operating precision
"allow_gpu_fallback": false, # (DLA only) Allow layers unsupported on DLA to run on GPU
"device": torch.device("cuda"), # Type of device to run engine on (for DLA use trtorch.DeviceType.DLA)
"device_type": torch.device("cuda"), # Type of device to run engine on (for DLA use trtorch.DeviceType.DLA)
"capability": trtorch.EngineCapability.DEFAULT, # Restrict kernel selection to safe gpu kernels or safe dla kernels
"num_min_timing_iters": 2, # Number of minimization timing iterations used to select kernels
"num_avg_timing_iters": 1, # Number of averaging timing iterations used to select kernels
@@ -91,7 +91,7 @@ def convert_method_to_trt_engine(module: torch.jit.ScriptModule, method_name: st
"debug": false, # enable debuggable engine
"strict_types": false, # kernels should strictly run in operating precision
"allow_gpu_fallback": false, # (DLA only) Allow layers unsupported on DLA to run on GPU
"device": torch.device("cuda"), # Type of device to run engine on (for DLA use trtorch.DeviceType.DLA)
"device_type": torch.device("cuda"), # Type of device to run engine on (for DLA use trtorch.DeviceType.DLA)
"capability": trtorch.EngineCapability.DEFAULT, # Restrict kernel selection to safe gpu kernels or safe dla kernels
"num_min_timing_iters": 2, # Number of minimization timing iterations used to select kernels
"num_avg_timing_iters": 1, # Number of averaging timing iterations used to select kernels
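As reflected in the docstring change above, the direct trtorch.compile path now reads the target device from the "device_type" key; a minimal sketch (MyModel is a placeholder, not part of this commit):

    import torch
    import trtorch

    scripted_mod = torch.jit.script(MyModel().eval().cuda())

    trt_mod = trtorch.compile(scripted_mod, {
        "input_shapes": [(1, 3, 224, 224)],
        "op_precision": torch.half,
        "device_type": torch.device("cuda"),  # formerly passed under the "device" key
    })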
47 changes: 47 additions & 0 deletions py/trtorch/csrc/register_tensorrt_classes.cpp
@@ -0,0 +1,47 @@
#include "tensorrt_classes.h"

namespace trtorch {
namespace backend {
namespace {
void RegisterTRTCompileSpec() {
#define ADD_FIELD_GET_SET_REGISTRATION(registry, class_name, field_name) \
(registry).def("set_"#field_name, &class_name::set_##field_name); \
(registry).def("get_"#field_name, &class_name::get_##field_name);

static auto TRTORCH_UNUSED TRTInputRangeTSRegistration = torch::class_<trtorch::pyapi::InputRange>("tensorrt", "InputRange")
.def(torch::init<>());

ADD_FIELD_GET_SET_REGISTRATION(TRTInputRangeTSRegistration, trtorch::pyapi::InputRange, min);
ADD_FIELD_GET_SET_REGISTRATION(TRTInputRangeTSRegistration, trtorch::pyapi::InputRange, opt);
ADD_FIELD_GET_SET_REGISTRATION(TRTInputRangeTSRegistration, trtorch::pyapi::InputRange, max);

static auto TRTORCH_UNUSED TRTCompileSpecTSRegistration = torch::class_<trtorch::pyapi::CompileSpec>("tensorrt", "CompileSpec")
.def(torch::init<>())
.def("append_input_range", &trtorch::pyapi::CompileSpec::appendInputRange)
.def("__str__", &trtorch::pyapi::CompileSpec::stringify);

ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, trtorch::pyapi::CompileSpec, op_precision);
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, trtorch::pyapi::CompileSpec, refit);
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, trtorch::pyapi::CompileSpec, debug);
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, trtorch::pyapi::CompileSpec, strict_types);
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, trtorch::pyapi::CompileSpec, allow_gpu_fallback);
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, trtorch::pyapi::CompileSpec, device);
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, trtorch::pyapi::CompileSpec, capability);
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, trtorch::pyapi::CompileSpec, num_min_timing_iters);
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, trtorch::pyapi::CompileSpec, num_avg_timing_iters);
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, trtorch::pyapi::CompileSpec, workspace_size);
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, trtorch::pyapi::CompileSpec, max_batch_size);
}

struct TRTTSRegistrations {
TRTTSRegistrations() {
RegisterTRTCompileSpec();
}
};

static TRTTSRegistrations register_trt_classes = TRTTSRegistrations();
}
} // namespace backend
} // namespace trtorch
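For context, these registrations are what TensorRTCompileSpec drives from Python; a sketch of constructing the bound TorchScript classes directly (assumes importing trtorch loads the extension and runs the registrations):

    import torch
    import trtorch  # loading the extension registers the tensorrt.* classes

    ir = torch.classes.tensorrt.InputRange()
    ir.set_min([1, 3, 224, 224])
    ir.set_opt([1, 3, 512, 512])
    ir.set_max([1, 3, 1024, 1024])

    spec = torch.classes.tensorrt.CompileSpec()
    spec.append_input_range(ir)
    spec.set_refit(False)
    spec.set_debug(False)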


86 changes: 86 additions & 0 deletions py/trtorch/csrc/tensorrt_backend.cpp
@@ -0,0 +1,86 @@
#include "torch/csrc/jit/passes/lower_graph.h"

#include "tensorrt_backend.h"
#include "tensorrt_classes.h"

#include "core/compiler.h"
#include "core/lowering/lowering.h"
#include "core/runtime/runtime.h"

namespace trtorch {
namespace backend {

c10::IValue TensorRTBackend::preprocess(c10::IValue mod, c10::impl::GenericDict method_compile_spec) {
auto mod_ = mod.toModule();
LOG_DEBUG("Placing module in eval mode if not already");
mod_.eval();
mod_ = core::lowering::LowerModule(mod_);

auto spec =
c10::impl::toTypedDict<std::string, at::IValue>(method_compile_spec);

for (auto it = spec.begin(), end = spec.end(); it != end; ++it) {
TRTORCH_CHECK(core::CheckMethodOperatorSupport(mod.toModule(), it->key()),
"Method " << it->key() << "cannot be compiled by TRTorch");
}

for (auto it = spec.begin(), end = spec.end(); it != end; ++it) {
const auto& method_name = it->key();
auto method = mod_.get_method(method_name);
auto graph = method.graph();
core::lowering::LowerGraph(graph);
}

return mod_._ivalue();
}

c10::impl::GenericDict TensorRTBackend::compile(c10::IValue processed_mod, c10::impl::GenericDict method_compile_spec) {
auto mod = processed_mod.toModule();
auto spec =
c10::impl::toTypedDict<std::string, at::IValue>(method_compile_spec);

auto handles = c10::impl::GenericDict(c10::StringType::get(), c10::getCustomClassType<c10::intrusive_ptr<core::runtime::TRTEngine>>());

for (auto it = spec.begin(), end = spec.end(); it != end; ++it) {
const auto& method_name = it->key();
auto method = mod.get_method(method_name);
auto g = method.graph();

auto raw_spec = it->value().toGenericDict().at(it->key()).toCustomClass<trtorch::pyapi::CompileSpec>();
LOG_DEBUG(raw_spec->stringify());
auto cfg = raw_spec->toInternalCompileSpec();
auto convert_cfg = std::move(cfg.convert_info);
auto graph_and_ivalues = torch::jit::LowerGraph(*g, mod._ivalue());

g = graph_and_ivalues.first;
auto params = graph_and_ivalues.second;
auto named_params = core::conversion::get_named_params(g->inputs(), params);

auto serialized_engine = core::conversion::ConvertBlockToEngine(g->block(), convert_cfg, named_params);
auto engine_handle = c10::make_intrusive<core::runtime::TRTEngine>(it->key(), serialized_engine);
handles.insert(method.name(), at::IValue(engine_handle));
}

return c10::impl::toGenericDict(handles);
}


c10::impl::GenericList TensorRTBackend::execute(c10::IValue handle, c10::impl::GenericList inputs) {
TRTORCH_ASSERT(inputs.size() > 0, "Trying to execute on empty list of arguments");
auto engine = handle.toCustomClass<core::runtime::TRTEngine>();
std::vector<at::Tensor> in_vec;
for (size_t i = 0, e = inputs.size(); i < e; ++i) {
c10::IValue val = inputs[i];
TRTORCH_CHECK(val.isTensor(), "TensorRT currently only accepts Tensors as inputs");
in_vec.push_back(val.toTensor());
}
auto outputs = core::runtime::execute_engine(in_vec, engine);
return c10::impl::toList(c10::List<at::Tensor>(outputs));
}

namespace {
static auto reg = torch::jit::backend<TensorRTBackend>("tensorrt");
}

} // namespace backend
} // namespace trtorch
19 changes: 19 additions & 0 deletions py/trtorch/csrc/tensorrt_backend.h
@@ -0,0 +1,19 @@
#pragma once
#include "torch/csrc/jit/api/module.h"
#include "torch/csrc/jit/backends/backend.h"

namespace trtorch {
namespace backend {

class TensorRTBackend: public torch::jit::PyTorchBackendInterface {
public:
explicit TensorRTBackend() {}
virtual ~TensorRTBackend() = default;

c10::IValue preprocess(c10::IValue mod, c10::impl::GenericDict method_compile_spec) override;
c10::impl::GenericDict compile(c10::IValue processed_mod, c10::impl::GenericDict method_compile_spec) override;
c10::impl::GenericList execute(c10::IValue handle, c10::impl::GenericList inputs) override;
};

} // namespace backend
} // namespace trtorch