
Commit

feat(//py): Initial compliant implementation of the to_backend API for PyTorch

Users can now use a direct PyTorch integration by just importing the
trtorch package. The only difference between torch._C._jit_to_tensorrt
and trtorch.compile is that you need to wrap your spec dictionary with
the trtorch.TensorRTCompileSpec constructor.

Signed-off-by: Naren Dasan <naren@narendasan.com>
Signed-off-by: Naren Dasan <narens@nvidia.com>
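A minimal usage sketch of the new path (MyModel is a placeholder module, and the exact argument form of torch._C._jit_to_tensorrt is assumed here rather than taken from this commit):

    import torch
    import trtorch

    # Any scriptable nn.Module works; MyModel is hypothetical.
    scripted_mod = torch.jit.script(MyModel().eval().cuda())

    spec = {
        "forward": trtorch.TensorRTCompileSpec({
            "input_shapes": [(1, 3, 224, 224)],  # static shape for the single input
            "op_precision": torch.half,          # build and run the engine in FP16
        })
    }

    # Hand the wrapped, per-method spec to the PyTorch to_backend entry point.
    trt_mod = torch._C._jit_to_tensorrt(scripted_mod._c, spec)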
narendasan committed Oct 21, 2020
1 parent b24c0d8 commit 59113cf
Showing 15 changed files with 573 additions and 133 deletions.
8 changes: 7 additions & 1 deletion py/setup.py
@@ -156,7 +156,12 @@ def run(self):

ext_modules = [
cpp_extension.CUDAExtension('trtorch._C',
['trtorch/csrc/trtorch_py.cpp'],
[
'trtorch/csrc/trtorch_py.cpp',
'trtorch/csrc/tensorrt_backend.cpp',
'trtorch/csrc/tensorrt_classes.cpp',
'trtorch/csrc/register_tensorrt_classes.cpp',
],
library_dirs=[
(dir_path + '/trtorch/lib/'),
"/opt/conda/lib/python3.6/config-3.6m-x86_64-linux-gnu"
@@ -165,6 +170,7 @@ def run(self):
"trtorch"
],
include_dirs=[
dir_path + "trtorch/csrc",
dir_path + "/../",
dir_path + "/../bazel-TRTorch/external/tensorrt/include",
],
1 change: 1 addition & 0 deletions py/trtorch/__init__.py
@@ -9,6 +9,7 @@

from trtorch._version import __version__
from trtorch._compiler import *
from trtorch._compile_spec import TensorRTCompileSpec
from trtorch._types import *
from trtorch import logging

93 changes: 84 additions & 9 deletions py/trtorch/_compile_spec.py
@@ -73,16 +73,21 @@ def _parse_op_precision(precision: Any) -> _types.dtype:

def _parse_device_type(device: Any) -> _types.DeviceType:
if isinstance(device, torch.device):
if torch.device.type == 'cuda':
if device.type == 'cuda':
return _types.DeviceType.gpu
else:
raise TypeError("Valid device choices are GPU (and DLA if on Jetson platforms) however got device type" + str(device.type))

ValueError("Got a device type other than GPU or DLA (type: " + str(device.type) + ")")
elif isinstance(device, _types.DeviceType):
return device

elif isinstance(device, str):
if device == "gpu" or device == "GPU":
return _types.DeviceType.gpu
elif device == "dla" or device == "DLA":
return _types.DeviceType.dla
else:
ValueError("Got a device type other than GPU or DLA (type: " + str(device) + ")")
else:
raise TypeError("Device specification must be of type torch.device or trtorch.DeviceType, but got: " + str(type(device)))
raise TypeError("Device specification must be of type torch.device, string or trtorch.DeviceType, but got: " + str(type(device)))

def _parse_compile_spec(compile_spec: Dict[str, Any]) -> trtorch._C.CompileSpec:
info = trtorch._C.CompileSpec()
@@ -110,11 +115,11 @@ def _parse_compile_spec(compile_spec: Dict[str, Any]) -> trtorch._C.CompileSpec:
assert isinstance(compile_spec["allow_gpu_fallback"], bool)
info.allow_gpu_fallback = compile_spec["allow_gpu_fallback"]

if "device" in compile_spec:
info.device = _parse_device_type(compile_spec["device"])
if "device_type" in compile_spec:
info.device = _parse_device_type(compile_spec["device_type"])

if "capability" in compile_spec:
assert isinstance(compile_spec["capability"], type.EngineCapability)
assert isinstance(compile_spec["capability"], _types.EngineCapability)
info.capability = compile_spec["capability"]

if "num_min_timing_iters" in compile_spec:
@@ -133,4 +138,74 @@ def _parse_compile_spec(compile_spec: Dict[str, Any]) -> trtorch._C.CompileSpec:
assert type(compile_spec["max_batch_size"]) is int
info.max_batch_size = compile_spec["max_batch_size"]

return info
return info

def TensorRTCompileSpec(compile_spec: Dict[str, Any]):
"""
Utility to create a formatted spec dictionary for using the PyTorch TensorRT backend
Args:
compile_spec (dict): Compilation settings including operating precision, target device, etc.
One key is required which is ``input_shapes``, describing the input sizes or ranges for inputs
to the graph. All other keys are optional. Entries for each method to be compiled.
.. code-block:: py
CompileSpec = {
"forward" : trtorch.TensorRTCompileSpec({
"input_shapes": [
(1, 3, 224, 224), # Static input shape for input #1
{
"min": (1, 3, 224, 224),
"opt": (1, 3, 512, 512),
"max": (1, 3, 1024, 1024)
} # Dynamic input shape for input #2
],
"op_precision": torch.half, # Operating precision set to FP16
"refit": false, # enable refit
"debug": false, # enable debuggable engine
"strict_types": false, # kernels should strictly run in operating precision
"allow_gpu_fallback": false, # (DLA only) Allow layers unsupported on DLA to run on GPU
"device": torch.device("cuda"), # Type of device to run engine on (for DLA use trtorch.DeviceType.DLA)
"capability": trtorch.EngineCapability.DEFAULT, # Restrict kernel selection to safe gpu kernels or safe dla kernels
"num_min_timing_iters": 2, # Number of minimization timing iterations used to select kernels
"num_avg_timing_iters": 1, # Number of averaging timing iterations used to select kernels
"workspace_size": 0, # Maximum size of workspace given to TensorRT
"max_batch_size": 0, # Maximum batch size (must be >= 1 to be set, 0 means not set)
})
}
Input Sizes can be specified as torch sizes, tuples or lists. Op precisions can be specified using
torch datatypes or trtorch datatypes and you can use either torch devices or the trtorch device type enum
to select device type.
Returns:
torch.classes.tensorrt.CompileSpec: List of methods and formatted spec objects to be provided to ``torch._C._jit_to_tensorrt``
"""

parsed_spec = _parse_compile_spec(compile_spec)

backend_spec = torch.classes.tensorrt.CompileSpec()

for i in parsed_spec.input_ranges:
ir = torch.classes.tensorrt.InputRange()
ir.set_min(i.min)
ir.set_opt(i.opt)
ir.set_max(i.max)
backend_spec.append_input_range(ir)

backend_spec.set_op_precision(int(parsed_spec.op_precision))
backend_spec.set_refit(parsed_spec.refit)
backend_spec.set_debug(parsed_spec.debug)
backend_spec.set_strict_types(parsed_spec.strict_types)
backend_spec.set_allow_gpu_fallback(parsed_spec.allow_gpu_fallback)
backend_spec.set_device(int(parsed_spec.device))
backend_spec.set_capability(int(parsed_spec.capability))
backend_spec.set_num_min_timing_iters(parsed_spec.num_min_timing_iters)
backend_spec.set_num_avg_timing_iters(parsed_spec.num_avg_timing_iters)
backend_spec.set_workspace_size(parsed_spec.workspace_size)
backend_spec.set_max_batch_size(parsed_spec.max_batch_size)

return backend_spec
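A short sketch of using the wrapper above with a dynamic input range (values are illustrative; the printed form comes from the stringify binding added in register_tensorrt_classes.cpp):

    import torch
    import trtorch

    backend_spec = trtorch.TensorRTCompileSpec({
        "input_shapes": [{
            "min": (1, 3, 224, 224),
            "opt": (1, 3, 512, 512),
            "max": (1, 3, 1024, 1024),
        }],                          # dynamic range for a single input
        "op_precision": torch.half,
    })

    # CompileSpec binds __str__ to stringify(), so the parsed settings can be printed.
    print(backend_spec)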

4 changes: 2 additions & 2 deletions py/trtorch/_compiler.py
@@ -39,7 +39,7 @@ def compile(module: torch.jit.ScriptModule, compile_spec: Any) -> torch.jit.Scri
"debug": false, # enable debuggable engine
"strict_types": false, # kernels should strictly run in operating precision
"allow_gpu_fallback": false, # (DLA only) Allow layers unsupported on DLA to run on GPU
"device": torch.device("cuda"), # Type of device to run engine on (for DLA use trtorch.DeviceType.DLA)
"device_type": torch.device("cuda"), # Type of device to run engine on (for DLA use trtorch.DeviceType.DLA)
"capability": trtorch.EngineCapability.DEFAULT, # Restrict kernel selection to safe gpu kernels or safe dla kernels
"num_min_timing_iters": 2, # Number of minimization timing iterations used to select kernels
"num_avg_timing_iters": 1, # Number of averaging timing iterations used to select kernels
@@ -91,7 +91,7 @@ def convert_method_to_trt_engine(module: torch.jit.ScriptModule, method_name: st
"debug": false, # enable debuggable engine
"strict_types": false, # kernels should strictly run in operating precision
"allow_gpu_fallback": false, # (DLA only) Allow layers unsupported on DLA to run on GPU
"device": torch.device("cuda"), # Type of device to run engine on (for DLA use trtorch.DeviceType.DLA)
"device_type": torch.device("cuda"), # Type of device to run engine on (for DLA use trtorch.DeviceType.DLA)
"capability": trtorch.EngineCapability.DEFAULT, # Restrict kernel selection to safe gpu kernels or safe dla kernels
"num_min_timing_iters": 2, # Number of minimization timing iterations used to select kernels
"num_avg_timing_iters": 1, # Number of averaging timing iterations used to select kernels
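As reflected in the docstring change above, the direct trtorch.compile path now reads the target device from the "device_type" key; a minimal sketch (MyModel is a placeholder, not part of this commit):

    import torch
    import trtorch

    scripted_mod = torch.jit.script(MyModel().eval().cuda())

    trt_mod = trtorch.compile(scripted_mod, {
        "input_shapes": [(1, 3, 224, 224)],
        "op_precision": torch.half,
        "device_type": torch.device("cuda"),  # formerly passed under the "device" key
    })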
47 changes: 47 additions & 0 deletions py/trtorch/csrc/register_tensorrt_classes.cpp
@@ -0,0 +1,47 @@
#include "tensorrt_classes.h"

namespace trtorch {
namespace backend {
namespace {
void RegisterTRTCompileSpec() {
#define ADD_FIELD_GET_SET_REGISTRATION(registry, class_name, field_name) \
(registry).def("set_"#field_name, &class_name::set_##field_name); \
(registry).def("get_"#field_name, &class_name::get_##field_name);

static auto TRTORCH_UNUSED TRTInputRangeTSRegistration = torch::class_<trtorch::pyapi::InputRange>("tensorrt", "InputRange")
.def(torch::init<>());

ADD_FIELD_GET_SET_REGISTRATION(TRTInputRangeTSRegistration, trtorch::pyapi::InputRange, min);
ADD_FIELD_GET_SET_REGISTRATION(TRTInputRangeTSRegistration, trtorch::pyapi::InputRange, opt);
ADD_FIELD_GET_SET_REGISTRATION(TRTInputRangeTSRegistration, trtorch::pyapi::InputRange, max);

static auto TRTORCH_UNUSED TRTCompileSpecTSRegistration = torch::class_<trtorch::pyapi::CompileSpec>("tensorrt", "CompileSpec")
.def(torch::init<>())
.def("append_input_range", &trtorch::pyapi::CompileSpec::appendInputRange)
.def("__str__", &trtorch::pyapi::CompileSpec::stringify);

ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, trtorch::pyapi::CompileSpec, op_precision);
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, trtorch::pyapi::CompileSpec, refit);
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, trtorch::pyapi::CompileSpec, debug);
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, trtorch::pyapi::CompileSpec, strict_types);
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, trtorch::pyapi::CompileSpec, allow_gpu_fallback);
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, trtorch::pyapi::CompileSpec, device);
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, trtorch::pyapi::CompileSpec, capability);
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, trtorch::pyapi::CompileSpec, num_min_timing_iters);
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, trtorch::pyapi::CompileSpec, num_avg_timing_iters);
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, trtorch::pyapi::CompileSpec, workspace_size);
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, trtorch::pyapi::CompileSpec, max_batch_size);
}

struct TRTTSRegistrations {
TRTTSRegistrations() {
RegisterTRTCompileSpec();
}
};

static TRTTSRegistrations register_trt_classes = TRTTSRegistrations();
}
} // namespace backend
} // namespace trtorch
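For context, these registrations are what TensorRTCompileSpec drives from Python; a sketch of constructing the bound TorchScript classes directly (assumes importing trtorch loads the extension and runs the registrations):

    import torch
    import trtorch  # loading the extension registers the tensorrt.* classes

    ir = torch.classes.tensorrt.InputRange()
    ir.set_min([1, 3, 224, 224])
    ir.set_opt([1, 3, 512, 512])
    ir.set_max([1, 3, 1024, 1024])

    spec = torch.classes.tensorrt.CompileSpec()
    spec.append_input_range(ir)
    spec.set_refit(False)
    spec.set_debug(False)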


86 changes: 86 additions & 0 deletions py/trtorch/csrc/tensorrt_backend.cpp
@@ -0,0 +1,86 @@
#include "torch/csrc/jit/passes/lower_graph.h"

#include "tensorrt_backend.h"
#include "tensorrt_classes.h"

#include "core/compiler.h"
#include "core/lowering/lowering.h"
#include "core/runtime/runtime.h"

namespace trtorch {
namespace backend {

c10::IValue TensorRTBackend::preprocess(c10::IValue mod, c10::impl::GenericDict method_compile_spec) {
auto mod_ = mod.toModule();
LOG_DEBUG("Placing module in eval mode if not already");
mod_.eval();
mod_ = core::lowering::LowerModule(mod_);

auto spec =
c10::impl::toTypedDict<std::string, at::IValue>(method_compile_spec);

for (auto it = spec.begin(), end = spec.end(); it != end; ++it) {
TRTORCH_CHECK(core::CheckMethodOperatorSupport(mod.toModule(), it->key()),
"Method " << it->key() << "cannot be compiled by TRTorch");
}

for (auto it = spec.begin(), end = spec.end(); it != end; ++it) {
const auto& method_name = it->key();
auto method = mod_.get_method(method_name);
auto graph = method.graph();
core::lowering::LowerGraph(graph);
}

return mod_._ivalue();
}

c10::impl::GenericDict TensorRTBackend::compile(c10::IValue processed_mod, c10::impl::GenericDict method_compile_spec) {
auto mod = processed_mod.toModule();
auto spec =
c10::impl::toTypedDict<std::string, at::IValue>(method_compile_spec);

auto handles = c10::impl::GenericDict(c10::StringType::get(), c10::getCustomClassType<c10::intrusive_ptr<core::runtime::TRTEngine>>());

for (auto it = spec.begin(), end = spec.end(); it != end; ++it) {
const auto& method_name = it->key();
auto method = mod.get_method(method_name);
auto g = method.graph();

auto raw_spec = it->value().toGenericDict().at(it->key()).toCustomClass<trtorch::pyapi::CompileSpec>();
LOG_DEBUG(raw_spec->stringify());
auto cfg = raw_spec->toInternalCompileSpec();
auto convert_cfg = std::move(cfg.convert_info);
auto graph_and_ivalues = torch::jit::LowerGraph(*g, mod._ivalue());

g = graph_and_ivalues.first;
auto params = graph_and_ivalues.second;
auto named_params = core::conversion::get_named_params(g->inputs(), params);

auto serialized_engine = core::conversion::ConvertBlockToEngine(g->block(), convert_cfg, named_params);
auto engine_handle = c10::make_intrusive<core::runtime::TRTEngine>(it->key(), serialized_engine);
handles.insert(method.name(), at::IValue(engine_handle));
}

return c10::impl::toGenericDict(handles);
}


c10::impl::GenericList TensorRTBackend::execute(c10::IValue handle, c10::impl::GenericList inputs) {
TRTORCH_ASSERT(inputs.size() > 0, "Trying to execute on empty list of arguments");
auto engine = handle.toCustomClass<core::runtime::TRTEngine>();
std::vector<at::Tensor> in_vec;
for (size_t i = 0, e = inputs.size(); i < e; ++i) {
c10::IValue val = inputs[i];
TRTORCH_CHECK(val.isTensor(), "TensorRT currently only accepts Tensors as inputs");
in_vec.push_back(val.toTensor());
}
auto outputs = core::runtime::execute_engine(in_vec, engine);
return c10::impl::toList(c10::List<at::Tensor>(outputs));
}

namespace {
static auto reg = torch::jit::backend<TensorRTBackend>("tensorrt");
}

} // namespace backend
} // namespace trtorch
19 changes: 19 additions & 0 deletions py/trtorch/csrc/tensorrt_backend.h
@@ -0,0 +1,19 @@
#pragma once
#include "torch/csrc/jit/api/module.h"
#include "torch/csrc/jit/backends/backend.h"

namespace trtorch {
namespace backend {

class TensorRTBackend: public torch::jit::PyTorchBackendInterface {
public:
explicit TensorRTBackend() {}
virtual ~TensorRTBackend() = default;

c10::IValue preprocess(c10::IValue mod, c10::impl::GenericDict method_compile_spec) override;
c10::impl::GenericDict compile(c10::IValue processed_mod, c10::impl::GenericDict method_compile_spec) override;
c10::impl::GenericList execute(c10::IValue handle, c10::impl::GenericList inputs) override;
};

} // namespace backend
} // namespace trtorch