diff --git a/python/tvm/relay/op/contrib/tensorrt.py b/python/tvm/relay/op/contrib/tensorrt.py
index 24c468fee0fe..739d49c412e8 100644
--- a/python/tvm/relay/op/contrib/tensorrt.py
+++ b/python/tvm/relay/op/contrib/tensorrt.py
@@ -22,7 +22,7 @@
 from tvm import relay
 from tvm.relay import transform
 from tvm.relay.build_module import bind_params_by_name
-from tvm.relay.expr import Call, Constant, Tuple, GlobalVar
+from tvm.relay.expr import Call, Constant, Tuple, GlobalVar, Var, TupleGetItem
 from tvm.relay.expr_functor import ExprMutator
 
 logger = logging.getLogger("TensorRT")
@@ -155,10 +155,46 @@ def partition_for_tensorrt(
     return mod, config
 
 
+def check_dynamism(args, op_name):
+    """
+    Check for dynamism inside any of the args in the op.
+
+    Parameters
+    ----------
+    args : tvm.ir.container.Array
+        Arguments of the op. Each argument's shape is checked for the presence of
+        dynamic components.
+    op_name : str
+        Name of the op for debugging purposes only.
+
+    Returns
+    -------
+    ret : bool
+        True if dynamism is present, False otherwise.
+    """
+    for arg in args:
+        if isinstance(arg, (Call, Var, Constant, TupleGetItem)):
+            for dim_shape in arg.checked_type.shape:
+                if isinstance(dim_shape, tvm.tir.expr.Any):
+                    return True
+        elif isinstance(arg, Tuple):
+            return check_dynamism(arg.fields, op_name)
+        else:
+            logger.info(
+                "Arg not supported in TensorRT for %s with type %s",
+                op_name,
+                type(arg),
+            )
+            return True
+    return False
+
+
 def _register_external_op_helper_with_checker(op_name, checker):
     @tvm.ir.register_op_attr(op_name, "target.tensorrt")
     def _func_wrapper(expr):
         attrs, args = expr.attrs, expr.args
+        # ops with dynamic shapes are offloaded to VM
+        if check_dynamism(args, op_name):
+            return False
         if any([x.checked_type.dtype != "float32" for x in args]):
             logger.info("Only float32 inputs are supported for TensorRT.")
             return False
@@ -173,6 +209,23 @@ def _register_external_op_helper(op_name, supported=True):
     )
 
 
+def _register_external_dynamic_check_func(op_name):
+    """Wrapper to check dynamic shapes inside any of the args in the op."""
+
+    def _decorator_helper(checker):
+        @tvm.ir.register_op_attr(op_name, "target.tensorrt")
+        def _func_wrapper(expr):
+            args = expr.args
+            # ops with dynamic shapes are offloaded to VM
+            if check_dynamism(args, op_name):
+                return False
+            return checker(expr)
+
+        return _func_wrapper
+
+    return _decorator_helper
+
+
 # Ops which are always supported
 _register_external_op_helper("nn.relu")
 _register_external_op_helper("sigmoid")
@@ -192,7 +245,49 @@ def _register_external_op_helper(op_name, supported=True):
 _register_external_op_helper("clip")
 
 
-@tvm.ir.register_op_attr("add", "target.tensorrt")
+def reduce_annotate_fn(attrs, args, op_name):
+    """Helper for reduce operations."""
+    if not attrs.axis or len(attrs.axis) == 0:
+        logger.info("%s: cannot reduce to scalar.", op_name)
+        return False
+    if attrs.exclude:
+        logger.info("%s: exclude not supported.", op_name)
+        return False
+    if get_tensorrt_use_implicit_batch_mode() and any([x == 0 for x in map(int, attrs.axis)]):
+        logger.info("%s: can't modify batch dimension.", op_name)
+        return False
+    return True
+
+
+_register_external_op_helper_with_checker("sum", reduce_annotate_fn)
+_register_external_op_helper_with_checker("prod", reduce_annotate_fn)
+_register_external_op_helper_with_checker("max", reduce_annotate_fn)
+_register_external_op_helper_with_checker("min", reduce_annotate_fn)
+_register_external_op_helper_with_checker("mean", reduce_annotate_fn)
+
+
+def trt_version_annotate_fn(version):
+    """Helper for ops which require a minimum TRT version."""
+
+    def _func_wrapper(attrs, args, op_name):
+        if get_tensorrt_version() < version:
+            logger.info(
+                "%s: requires TensorRT version %s or higher.", op_name, ".".join(map(str, version))
+            )
+            return False
+        return True
+
+    return _func_wrapper
+
+
+_register_external_op_helper_with_checker("nn.leaky_relu", trt_version_annotate_fn((5, 1, 5)))
+_register_external_op_helper_with_checker("sin", trt_version_annotate_fn((5, 1, 5)))
+_register_external_op_helper_with_checker("cos", trt_version_annotate_fn((5, 1, 5)))
+_register_external_op_helper_with_checker("atan", trt_version_annotate_fn((5, 1, 5)))
+_register_external_op_helper_with_checker("ceil", trt_version_annotate_fn((5, 1, 5)))
+
+
+@_register_external_dynamic_check_func("add")
 def add_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if add is supported by TensorRT."""
 
@@ -212,7 +307,7 @@ def add_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
-@tvm.ir.register_op_attr("nn.batch_norm", "target.tensorrt")
+@_register_external_dynamic_check_func("nn.batch_norm")
 def batch_norm_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if nn.batch_norm is supported by TensorRT."""
 
@@ -226,7 +321,7 @@ def batch_norm_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
-@tvm.ir.register_op_attr("nn.softmax", "target.tensorrt")
+@_register_external_dynamic_check_func("nn.softmax")
 def softmax_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if nn.softmax is supported by TensorRT."""
 
@@ -240,7 +335,7 @@ def softmax_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
-@tvm.ir.register_op_attr("nn.conv2d", "target.tensorrt")
+@_register_external_dynamic_check_func("nn.conv2d")
 def conv2d_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if nn.conv2d is supported by TensorRT."""
 
@@ -260,7 +355,7 @@ def conv2d_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
-@tvm.ir.register_op_attr("nn.dense", "target.tensorrt")
+@_register_external_dynamic_check_func("nn.dense")
 def dense_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if dense is supported by TensorRT."""
 
@@ -279,7 +374,7 @@ def dense_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
-@tvm.ir.register_op_attr("nn.bias_add", "target.tensorrt")
+@_register_external_dynamic_check_func("nn.bias_add")
 def bias_add_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if nn.bias_add is supported by TensorRT."""
 
@@ -294,7 +389,7 @@ def bias_add_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
-@tvm.ir.register_op_attr("nn.max_pool2d", "target.tensorrt")
+@_register_external_dynamic_check_func("nn.max_pool2d")
 def max_pool_2d_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if nn.max_pool2d is supported by TensorRT."""
 
@@ -311,7 +406,7 @@ def max_pool_2d_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
-@tvm.ir.register_op_attr("nn.avg_pool2d", "target.tensorrt")
+@_register_external_dynamic_check_func("nn.avg_pool2d")
 def avg_pool_2d_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if nn.avg_pool2d is supported by TensorRT."""
 
@@ -341,7 +436,7 @@ def avg_pool_2d_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
-@tvm.ir.register_op_attr("nn.global_max_pool2d", "target.tensorrt")
+@_register_external_dynamic_check_func("nn.global_max_pool2d")
 def global_max_pool_2d_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if nn.global_max_pool2d is supported by TensorRT."""
 
@@ -355,7 +450,7 @@ def global_max_pool_2d_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
-@tvm.ir.register_op_attr("nn.global_avg_pool2d", "target.tensorrt")
+@_register_external_dynamic_check_func("nn.global_avg_pool2d")
 def global_avg_pool_2d_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if nn.global_avg_pool2d is supported by TensorRT."""
 
@@ -369,7 +464,7 @@ def global_avg_pool_2d_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
-@tvm.ir.register_op_attr("expand_dims", "target.tensorrt")
+@_register_external_dynamic_check_func("expand_dims")
 def expand_dims_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if expand_dims is supported by TensorRT."""
 
@@ -383,7 +478,7 @@ def expand_dims_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
-@tvm.ir.register_op_attr("squeeze", "target.tensorrt")
+@_register_external_dynamic_check_func("squeeze")
 def squeeze_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if squeeze is supported by TensorRT."""
 
@@ -400,7 +495,7 @@ def squeeze_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
-@tvm.ir.register_op_attr("concatenate", "target.tensorrt")
+@_register_external_dynamic_check_func("concatenate")
 def concatenate_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if concatenate is supported by TensorRT."""
 
@@ -421,7 +516,7 @@ def concatenate_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
-@tvm.ir.register_op_attr("nn.conv2d_transpose", "target.tensorrt")
+@_register_external_dynamic_check_func("nn.conv2d_transpose")
 def conv2d_transpose_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if nn.conv2d_transpose is supported by TensorRT."""
 
@@ -446,7 +541,7 @@ def conv2d_transpose_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
-@tvm.ir.register_op_attr("transpose", "target.tensorrt")
+@_register_external_dynamic_check_func("transpose")
 def transpose_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if transpose is supported by TensorRT."""
 
@@ -460,7 +555,7 @@ def transpose_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
-@tvm.ir.register_op_attr("layout_transform", "target.tensorrt")
+@_register_external_dynamic_check_func("layout_transform")
 def layout_transform_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if layout_transform is supported by TensorRT."""
 
@@ -481,7 +576,7 @@ def layout_transform_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
-@tvm.ir.register_op_attr("reshape", "target.tensorrt")
+@_register_external_dynamic_check_func("reshape")
 def reshape_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if reshape is supported by TensorRT."""
 
@@ -514,7 +609,7 @@ def reshape_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
-@tvm.ir.register_op_attr("nn.pad", "target.tensorrt")
+@_register_external_dynamic_check_func("nn.pad")
 def pad_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if nn.pad is supported by TensorRT."""
 
@@ -536,49 +631,7 @@ def pad_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
-def reduce_annotate_fn(attrs, args, op_name):
-    """Helper for reduce operations."""
-    if not attrs.axis or len(attrs.axis) == 0:
-        logger.info("%s: cannot reduce to scalar.", op_name)
-        return False
-    if attrs.exclude:
-        logger.info("%s: exclude not supported.", op_name)
-        return False
-    if get_tensorrt_use_implicit_batch_mode() and any([x == 0 for x in map(int, attrs.axis)]):
-        logger.info("%s: can't modify batch dimension.", op_name)
-        return False
-    return True
-
-
-_register_external_op_helper_with_checker("sum", reduce_annotate_fn)
-_register_external_op_helper_with_checker("prod", reduce_annotate_fn)
-_register_external_op_helper_with_checker("max", reduce_annotate_fn)
-_register_external_op_helper_with_checker("min", reduce_annotate_fn)
-_register_external_op_helper_with_checker("mean", reduce_annotate_fn)
-
-
-def trt_version_annotate_fn(version):
-    """Helper for ops which require a minimum TRT version"""
-
-    def _func_wrapper(attrs, args, op_name):
-        if get_tensorrt_version() < version:
-            logger.info(
-                "%s: requires TensorRT version %s or higher.", op_name, ".".join(map(str, version))
-            )
-            return False
-        return True
-
-    return _func_wrapper
-
-
-_register_external_op_helper_with_checker("nn.leaky_relu", trt_version_annotate_fn((5, 1, 5)))
-_register_external_op_helper_with_checker("sin", trt_version_annotate_fn((5, 1, 5)))
-_register_external_op_helper_with_checker("cos", trt_version_annotate_fn((5, 1, 5)))
-_register_external_op_helper_with_checker("atan", trt_version_annotate_fn((5, 1, 5)))
-_register_external_op_helper_with_checker("ceil", trt_version_annotate_fn((5, 1, 5)))
-
-
-@tvm.ir.register_op_attr("strided_slice", "target.tensorrt")
+@_register_external_dynamic_check_func("strided_slice")
 def strided_slice_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if strided_slice is supported by TensorRT."""
 
@@ -601,11 +654,34 @@ def strided_slice_annotate_fn(expr):  # pylint: disable=unused-variable
     if any([x is not None and x <= 0 for x in attrs.strides]):
         logger.info("strided_slice: stride must be positive")
         return False
+    for i in range(0, len(args[0].checked_type.shape)):
+        begin = int(attrs.begin[i])
+        if attrs.slice_mode == "end":
+            end = (
+                int(attrs.end[i])
+                if attrs.end[i] is not None and int(attrs.end[i]) != -1
+                else args[0].checked_type.shape[i]
+            )
+            size = int(end) - int(begin)
+        elif attrs.slice_mode == "size":
+            size = (
+                int(attrs.end[i])
+                if attrs.end[i] is not None and int(attrs.end[i]) != -1
+                else args[0].checked_type.shape[i] - begin
+            )
+        else:
+            logger.warning("strided_slice: unknown slice mode encountered")
+            return False
+
+        if int(size) < 1:
+            logger.info("strided_slice: size of slice must be at least 1")
+            return False
+
     return True
 
 
-@tvm.ir.register_op_attr("nn.adaptive_max_pool2d", "target.tensorrt")
-def adapative_max_pool2d_annotate_fn(expr):  # pylint: disable=unused-variable
+@_register_external_dynamic_check_func("nn.adaptive_max_pool2d")
+def adaptive_max_pool2d_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if nn.adaptive_max_pool2d is supported by TensorRT."""
 
     attrs, args = expr.attrs, expr.args
@@ -618,8 +693,8 @@ def adapative_max_pool2d_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
-@tvm.ir.register_op_attr("nn.adaptive_avg_pool2d", "target.tensorrt")
-def adapative_avg_pool2d_annotate_fn(expr):  # pylint: disable=unused-variable
+@_register_external_dynamic_check_func("nn.adaptive_avg_pool2d")
+def adaptive_avg_pool2d_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if nn.adaptive_avg_pool2d is supported by TensorRT."""
 
     attrs, args = expr.attrs, expr.args
@@ -632,7 +707,7 @@ def adapative_avg_pool2d_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
-@tvm.ir.register_op_attr("nn.conv3d", "target.tensorrt")
+@_register_external_dynamic_check_func("nn.conv3d")
 def conv3d_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if nn.conv3d is supported by TensorRT."""
 
@@ -654,7 +729,7 @@ def conv3d_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
-@tvm.ir.register_op_attr("nn.max_pool3d", "target.tensorrt")
+@_register_external_dynamic_check_func("nn.max_pool3d")
 def max_pool_3d_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if nn.max_pool3d is supported by TensorRT."""
 
@@ -670,7 +745,7 @@ def max_pool_3d_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
-@tvm.ir.register_op_attr("nn.avg_pool3d", "target.tensorrt")
+@_register_external_dynamic_check_func("nn.avg_pool3d")
 def avg_pool_3d_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if nn.avg_pool3d is supported by TensorRT."""
 
@@ -686,7 +761,7 @@ def avg_pool_3d_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
-@tvm.ir.register_op_attr("nn.conv3d_transpose", "target.tensorrt")
+@_register_external_dynamic_check_func("nn.conv3d_transpose")
 def conv3d_transpose_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if nn.conv3d_transpose is supported by TensorRT."""
 
@@ -774,13 +849,10 @@ def visit_call(self, call):
                     new_body = relay.bind(func.body, var_map)
                     return new_body
                 if name != "main":
-                    # Copy the GlobalVar (subgraph function) to the new module and call.
                     args = []
                     for arg in call.args:
                         args.append(super().visit(arg))
-                    subgraph_gv = relay.GlobalVar(name)
-                    self.new_mod[subgraph_gv] = self.mod[name]
-                    return subgraph_gv(*args)
+                    return call.op(*args)
             return super().visit_call(call)
 
     subgraphs_to_remove = []
@@ -792,7 +864,7 @@ def visit_call(self, call):
         if not is_valid_subgraph(mod[name].params, mod[name].body):
             subgraphs_to_remove.append(name)
     # Create new pruned module
-    new_mod = tvm.IRModule()
+    new_mod = tvm.IRModule(mod.functions, mod.type_definitions)
     new_mod["main"] = SubgraphRemover(subgraphs_to_remove, mod, new_mod).visit(mod["main"])
     return new_mod
diff --git a/src/relay/backend/contrib/tensorrt/codegen.cc b/src/relay/backend/contrib/tensorrt/codegen.cc
index 26f674dcd7b5..cb648333df8d 100644
--- a/src/relay/backend/contrib/tensorrt/codegen.cc
+++ b/src/relay/backend/contrib/tensorrt/codegen.cc
@@ -140,18 +140,25 @@ class TensorRTJSONSerializer : public backend::contrib::JSONSerializer {
     std::vector<std::string> start, size, strides;
     for (size_t i = 0; i < attrs->begin.value().size(); ++i) {
       const int begin_value = process_slice_index(attrs->begin.value()[i], 0, ishape[i]);
-      const int end_value = process_slice_index(attrs->end.value()[i], ishape[i], ishape[i]);
+      ICHECK_GE(begin_value, 0);
+      start.push_back(std::to_string(begin_value));
       const int stride_value = (default_strides || i >= attrs->strides.value().size() ||
                                 !attrs->strides.value()[i].defined())
                                    ? 1
                                    : attrs->strides.value()[i].as<IntImmNode>()->value;
       ICHECK_GT(stride_value, 0);
-      const int size_value = (end_value - begin_value + stride_value - 1) / stride_value;
-      ICHECK_GE(begin_value, 0);
+      strides.push_back(std::to_string(stride_value));
+      int size_value;
+      if (attrs->slice_mode == "end") {
+        const int end_value = process_slice_index(attrs->end.value()[i], ishape[i], ishape[i]);
+        size_value = (end_value - begin_value + stride_value - 1) / stride_value;
+      } else if (attrs->slice_mode == "size") {
+        // with slice_mode = "size", attrs->end_value means the size of the slice
+        int end_value = attrs->end.value()[i].as<IntImmNode>()->value;
+        size_value = (end_value == -1) ? ishape[i] - begin_value : end_value;
+      }
       ICHECK_GT(size_value, 0);
-      start.push_back(std::to_string(begin_value));
       size.push_back(std::to_string(size_value));
-      strides.push_back(std::to_string(stride_value));
     }
     std::vector<dmlc::any> start_attr, size_attr, strides_attr;
     start_attr.emplace_back(start);
diff --git a/src/runtime/contrib/tensorrt/tensorrt_ops.cc b/src/runtime/contrib/tensorrt/tensorrt_ops.cc
index a86f107941bc..057743c3b588 100644
--- a/src/runtime/contrib/tensorrt/tensorrt_ops.cc
+++ b/src/runtime/contrib/tensorrt/tensorrt_ops.cc
@@ -944,7 +944,7 @@ class ReduceOpConverter : public TensorRTOpConverter {
 #if TRT_VERSION_GE(5, 1, 5)
 class StridedSliceOpConverter : public TensorRTOpConverter {
  public:
-  StridedSliceOpConverter() : TensorRTOpConverter({kTensor, kWeight, kWeight, kWeight}) {}
+  StridedSliceOpConverter() : TensorRTOpConverter({kTensor}) {}
 
   void Convert(TensorRTOpConverterParams* params) const {
     auto input = params->inputs.at(0).tensor;
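The serializer's per-dimension size computation above differs by slice mode: with slice_mode = "end" the end index is resolved against the input shape and ceil-divided by the stride, while with slice_mode = "size" the end value already holds the size, with -1 meaning "to the end of the axis". A Python mirror of that arithmetic (an illustrative sketch, not code from this patch):

def slice_size(dim, begin, end, stride, slice_mode):
    # dim is the input extent on this axis; mirrors the C++ size_value logic.
    if slice_mode == "end":
        return (end - begin + stride - 1) // stride  # ceil-div by stride
    return dim - begin if end == -1 else end  # slice_mode == "size"

assert slice_size(10, 2, 8, 2, "end") == 3    # elements 2, 4, 6
assert slice_size(10, 2, -1, 1, "size") == 8  # runs to the end of the axis
assert slice_size(10, 2, 4, 1, "size") == 4   # takes exactly 4 elements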
diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py
index 8e8e54e8650a..8b61323a71ad 100644
--- a/tests/python/contrib/test_tensorrt.py
+++ b/tests/python/contrib/test_tensorrt.py
@@ -17,12 +17,15 @@
 import numpy as np
 import time
 import pytest
+import itertools
 
 import tvm
 import tvm.relay.testing
 from tvm import relay
 from tvm.relay.op.contrib import tensorrt
-from tvm.contrib import graph_runtime
+from tvm.contrib import graph_runtime, utils
+from tvm.runtime.vm import VirtualMachine
+from tvm.relay import Any, GlobalVar, transform
 
 
 def skip_codegen_test():
@@ -46,6 +49,23 @@ def skip_runtime_test():
     return False
 
 
+def vmobj_to_list(o):
+    if isinstance(o, tvm.nd.NDArray):
+        return [o.asnumpy()]
+    elif isinstance(o, tvm.runtime.container.ADT) or isinstance(o, list):
+        return [vmobj_to_list(f) for f in o]
+    else:
+        raise RuntimeError("Unknown object type: %s" % type(o))
+
+
+def assert_result_dict_holds(result_dict):
+    for k1, k2 in itertools.combinations(result_dict, 2):
+        res1 = vmobj_to_list(result_dict[k1])
+        res2 = vmobj_to_list(result_dict[k2])
+        for r1, r2 in zip(res1, res2):
+            tvm.testing.assert_allclose(r1, r2, rtol=1e-3, atol=1e-3)
+
+
 def run_and_verify_func(config, target="cuda"):
     """Test a Relay func by compiling, running, and comparing TVM and TRT outputs.
 
@@ -64,100 +84,76 @@ def run_and_verify_func(config, target="cuda"):
         for k, v in input_shapes.items()
         if k not in is_param
     }
-
-    # Run TRT
-    mod = tvm.IRModule()
-    mod["main"] = f
-    mod, config = tensorrt.partition_for_tensorrt(mod, params)
-    with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
-        graph, lib, graph_params = relay.build(mod, target, params=params)
-    if skip_runtime_test():
-        return
     ctx = tvm.context(target)
-    mod = graph_runtime.create(graph, lib, ctx=ctx)
-    mod.set_input(**graph_params)
-    mod.run(**input_dict)
-    results = [mod.get_output(i) for i in range(mod.get_num_outputs())]
-
-    # Run reference
-    mod = tvm.IRModule()
-    mod["main"] = f
-    with tvm.transform.PassContext(opt_level=3):
-        graph, lib, graph_params = relay.build(mod, target, params=params)
-    mod = graph_runtime.create(graph, lib, ctx=ctx)
-    mod.set_input(**graph_params)
-    mod.run(**input_dict)
-    ref_results = [mod.get_output(i) for i in range(mod.get_num_outputs())]
-
-    assert len(results) == len(ref_results)
-    for i in range(len(results)):
-        res = results[i].asnumpy()
-        ref_res = ref_results[i].asnumpy()
-        assert res.shape == ref_res.shape
-        tvm.testing.assert_allclose(res, ref_res, rtol=1e-3, atol=1e-3)
+    result_dict = dict()
+    for mode in ["graph", "vm"]:
+        for use_trt in [False, True]:
+            mod = tvm.IRModule()
+            mod["main"] = f
+            result_key = mode + ("_trt" if use_trt else "")
+            if use_trt:
+                mod, config = tensorrt.partition_for_tensorrt(mod, params)
+                with tvm.transform.PassContext(
+                    opt_level=3, config={"relay.ext.tensorrt.options": config}
+                ):
+                    exec = relay.create_executor(mode, mod=mod, ctx=ctx, target=target)
+            else:
+                with tvm.transform.PassContext(opt_level=3):
+                    exec = relay.create_executor(mode, mod=mod, ctx=ctx, target=target)
+            if not skip_runtime_test():
+                result_dict[result_key] = exec.evaluate()(**input_dict, **params)
+
+    if not skip_runtime_test():
+        assert_result_dict_holds(result_dict)
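# ---------------------------------------------------------------------------
# Usage sketch (hypothetical, not part of this patch): run_and_verify_func now
# cross-checks every combination of executor mode ("graph"/"vm") and TRT
# offload, so a new op test only has to supply (func, input_shapes, is_param):
#
#     def test_relu_example():
#         x = relay.var("x", shape=(1, 8, 3, 3), dtype="float32")
#         f = relay.Function([x], relay.nn.relu(x))
#         run_and_verify_func((f, {"x": (1, 8, 3, 3)}, []))
# ---------------------------------------------------------------------------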
 
 
 def run_and_verify_model(model):
     if skip_codegen_test():
         return
 
-    def compile_and_run(i_data, input_shape, dtype, use_trt=True, num_iteration=1):
-        import mxnet as mx
-        from mxnet.gluon.model_zoo.vision import get_model
-
-        def check_trt_used(graph):
-            import json
+    import mxnet as mx
+    from mxnet.gluon.model_zoo.vision import get_model
 
-            graph = json.loads(graph)
-            num_trt_subgraphs = sum(
-                [
-                    1
-                    for n in graph["nodes"]
-                    if n.get("attrs", {}).get("func_name", "").startswith("tensorrt_")
-                ]
-            )
-            assert num_trt_subgraphs >= 1
+    def check_trt_used(mod):
+        num_trt_subgraphs = sum(
+            [1 if gv.name_hint == "tensorrt_0" else 0 for gv in mod.get_global_vars()]
+        )
+        assert num_trt_subgraphs == 1
 
-        block = get_model(model, pretrained=True)
-        mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype)
+    def compile_and_run(mod, params, i_data, mode="vm", use_trt=True):
+        assert mode in ["graph", "vm"]
 
         if use_trt:
             mod, config = tensorrt.partition_for_tensorrt(mod, params)
+            check_trt_used(mod)
             with tvm.transform.PassContext(
                 opt_level=3, config={"relay.ext.tensorrt.options": config}
             ):
-                graph, lib, params = relay.build(mod, "cuda", params=params)
-            check_trt_used(graph)
+                exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda")
         else:
             with tvm.transform.PassContext(opt_level=3):
-                graph, lib, params = relay.build(mod, "cuda", params=params)
-
-        if skip_runtime_test():
-            return
-        mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0))
-        mod.set_input(**params)
-        # Warmup
-        for i in range(10):
-            mod.run(data=i_data)
-        # Time
-        times = []
-        for i in range(num_iteration):
-            start_time = time.time()
-            mod.run(data=i_data)
-            res = mod.get_output(0)
-            times.append(time.time() - start_time)
-        latency = 1000.0 * np.mean(times)
-        print(model, latency)
+            exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda")
+
+        res = exec.evaluate()(i_data, **params) if not skip_runtime_test() else None
         return res
 
     dtype = "float32"
     input_shape = (1, 3, 224, 224)
     i_data = np.random.uniform(-1, 1, input_shape).astype(dtype)
-    res = compile_and_run(i_data, input_shape, dtype, use_trt=True)
-    if skip_runtime_test():
-        return
-    ref_res = compile_and_run(i_data, input_shape, dtype, use_trt=False)
-    tvm.testing.assert_allclose(res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-3)
+    block = get_model(model, pretrained=True)
+    mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype)
+
+    result_dict = dict()
+    for mode in ["vm", "graph"]:
+        for use_trt in [True, False]:
+            result_key = mode + ("_trt" if use_trt else "")
+            result_dict[result_key] = compile_and_run(
+                mod, params, i_data, mode=mode, use_trt=use_trt
+            )
+
+    if not skip_runtime_test():
+        assert_result_dict_holds(result_dict)
 
 
 def test_tensorrt_simple():
@@ -174,19 +170,30 @@ def test_tensorrt_simple():
     out = relay.nn.relu(w)
     f = relay.Function([x, y, z], out)
 
-    mod = tvm.IRModule()
-    mod["main"] = f
-    mod, config = tensorrt.partition_for_tensorrt(mod)
-    with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
-        graph, lib, params = relay.build(mod, "cuda")
-    if skip_runtime_test():
-        return
-    mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0))
     x_data = np.random.uniform(-1, 1, xshape).astype(dtype)
     y_data = np.random.uniform(-1, 1, yshape).astype(dtype)
     z_data = np.random.uniform(-1, 1, zshape).astype(dtype)
-    mod.run(x=x_data, y=y_data, z=z_data)
-    results = [mod.get_output(i).asnumpy() for i in range(mod.get_num_outputs())]
+
+    result_dict = dict()
+    for mode in ["vm", "graph"]:
+        for use_trt in [True, False]:
+            mod = tvm.IRModule()
+            mod["main"] = f
+            result_key = mode + ("_trt" if use_trt else "")
+            if use_trt:
+                mod, config = tensorrt.partition_for_tensorrt(mod)
+                with tvm.transform.PassContext(
+                    opt_level=3, config={"relay.ext.tensorrt.options": config}
+                ):
+                    relay_exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda")
+            else:
+                with tvm.transform.PassContext(opt_level=3):
+                    relay_exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda")
+            if not skip_runtime_test():
+                result_dict[result_key] = relay_exec.evaluate()(x_data, y_data, z_data)
+
+    if not skip_runtime_test():
+        assert_result_dict_holds(result_dict)
 
 
 def test_tensorrt_simple_cpu_io():
@@ -211,6 +218,8 @@ def test_tensorrt_not_compatible():
         return
     dtype = "float32"
     xshape = (1, 32, 14, 14)
+    x_data = np.random.uniform(-1, 1, xshape).astype(dtype)
+
     x = relay.var("x", shape=(xshape), dtype=dtype)
     y = relay.add(x, x)
     z = relay.erf(y)
@@ -219,40 +228,116 @@ def test_tensorrt_not_compatible():
     mod = tvm.IRModule()
     mod["main"] = f
     mod, config = tensorrt.partition_for_tensorrt(mod)
-    with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
-        graph, lib, params = relay.build(mod, "cuda")
-    if skip_runtime_test():
-        return
-    mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0))
-    x_data = np.random.uniform(-1, 1, xshape).astype(dtype)
-    mod.run(x=x_data)
-    results = [mod.get_output(i).asnumpy() for i in range(mod.get_num_outputs())]
+    for mode in ["graph", "vm"]:
+        with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
+            exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda")
+        if not skip_runtime_test():
+            results = exec.evaluate()(x_data)
 
 
-def test_tensorrt_serialize():
+def test_tensorrt_serialize_graph_runtime():
     if skip_codegen_test():
         return
-    import mxnet
+    import mxnet as mx
     from mxnet.gluon.model_zoo.vision import get_model
 
+    data_shape = (1, 3, 224, 224)
+    data_type = "float32"
+    i_data = np.random.uniform(0, 1, data_shape).astype(data_type)
     block = get_model("resnet18_v1", pretrained=True)
-    mod, params = relay.frontend.from_mxnet(
-        block, shape={"data": (1, 3, 224, 224)}, dtype="float32"
-    )
-    # Compile
-    mod, config = tensorrt.partition_for_tensorrt(mod, params)
-    with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
-        lib = relay.build(mod, "cuda", params=params)
-    # Serialize
-    lib.export_library("compiled.so")
-    # Deserialize
-    loaded_lib = tvm.runtime.load_module("compiled.so")
-    # Run
-    if skip_runtime_test():
+    mod, params = relay.frontend.from_mxnet(block, shape={"data": data_shape}, dtype=data_type)
+    mod, config = tensorrt.partition_for_tensorrt(mod)
+    tmpdir = utils.tempdir()
+
+    def compile_graph(mod, params):
+        with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
+            graph, lib, params = relay.build(mod, params=params, target="cuda")
+        params = relay.save_param_dict(params)
+        return graph, lib, params
+
+    def run_graph(graph, lib, params):
+        mod_ = graph_runtime.create(graph, lib, ctx=tvm.gpu(0))
+        mod_.load_params(params)
+        mod_.run(data=i_data)
+        res = mod_.get_output(0)
+        return res
+
+    def save_graph(graph, lib, params):
+        # Serialize
+        with open(tmpdir.relpath("compiled.json"), "w") as f_graph_json:
+            f_graph_json.write(graph)
+        with open(tmpdir.relpath("compiled.params"), "wb") as f_params:
+            f_params.write(params)
+        lib.export_library(tmpdir.relpath("compiled.so"))
+
+    def load_graph():
+        # Deserialize
+        with open(tmpdir.relpath("compiled.json"), "r") as f_graph_json:
+            graph = f_graph_json.read()
+        with open(tmpdir.relpath("compiled.params"), "rb") as f_params:
+            params = bytearray(f_params.read())
+        lib = tvm.runtime.load_module(tmpdir.relpath("compiled.so"))
+        return graph, lib, params
+
+    # Test serialization with graph runtime
+    graph, lib, graph_params = compile_graph(mod, params)
+    save_graph(graph, lib, graph_params)
+    loaded_graph, loaded_lib, loaded_params = load_graph()
+
+    if not skip_runtime_test():
+        result_dict = dict()
+        result_dict["graph"] = run_graph(graph, lib, graph_params)
+        result_dict["graph_ref"] = run_graph(loaded_graph, loaded_lib, loaded_params)
+        assert_result_dict_holds(result_dict)
+
+
+def test_tensorrt_serialize_vm():
+    if skip_codegen_test():
         return
-    gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib["default"](tvm.gpu(0)))
-    i_data = np.random.uniform(0, 1, (1, 3, 224, 224)).astype("float32")
-    gen_module.run(data=i_data)
+    import mxnet as mx
+    from mxnet.gluon.model_zoo.vision import get_model
+
+    data_shape = (1, 3, 224, 224)
+    data_type = "float32"
+    i_data = np.random.uniform(0, 1, data_shape).astype(data_type)
+    block = get_model("resnet18_v1", pretrained=True)
+    mod, params = relay.frontend.from_mxnet(block, shape={"data": data_shape}, dtype=data_type)
+    mod, config = tensorrt.partition_for_tensorrt(mod)
+    tmpdir = utils.tempdir()
+
+    def compile_vm(mod, params):
+        with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
+            vm_exec = relay.vm.compile(mod, target="cuda", params=params)
+        code, lib = vm_exec.save()
+        return code, lib
+
+    def run_vm(code, lib):
+        vm_exec = tvm.runtime.vm.Executable.load_exec(code, lib)
+        vm = VirtualMachine(vm_exec, tvm.gpu(0))
+        result = vm.invoke("main", data=i_data)
+        return result
+
+    def save_vm(code, lib):
+        # Save the code and lib files.
+        lib.export_library(tmpdir.relpath("path_lib.so"))
+        with open(tmpdir.relpath("path_code.ro"), "wb") as fo:
+            fo.write(code)
+
+    def load_vm():
+        lib = tvm.runtime.load_module(tmpdir.relpath("path_lib.so"))
+        code = bytearray(open(tmpdir.relpath("path_code.ro"), "rb").read())
+        return lib, code
+
+    # Test serialization with VM
+    code_vm, lib_vm = compile_vm(mod, params)
+    save_vm(code_vm, lib_vm)
+    loaded_lib_vm, loaded_code_vm = load_vm()
+
+    if not skip_runtime_test():
+        result_dict = dict()
+        result_dict["vm"] = run_vm(code_vm, lib_vm)
+        result_dict["vm_ref"] = run_vm(loaded_code_vm, loaded_lib_vm)
+        assert_result_dict_holds(result_dict)
 
 
 def test_conv2d():
@@ -701,27 +786,40 @@ def get_graph(op, x_shape=(1, 2, 3, 4), axis=(2, 3), keepdims=False):
 
 
 def test_strided_slice():
-    def get_graph(x_shape, begin, end, strides=None):
+    def get_graph(x_shape, begin, end, strides=None, slice_mode="size"):
         x = relay.var("x", shape=(x_shape), dtype="float32")
         if strides:
             out = relay.strided_slice(
                 x,
-                relay.expr.const(begin, dtype="int32"),
-                relay.expr.const(end, dtype="int32"),
-                relay.expr.const(strides, dtype="int32"),
+                begin,
+                end,
+                strides,
+                slice_mode=slice_mode,
             )
         else:
             out = relay.strided_slice(
                 x,
-                relay.expr.const(begin, dtype="int32"),
-                relay.expr.const(end, dtype="int32"),
+                begin,
+                end,
+                slice_mode=slice_mode,
            )
         f = relay.Function([x], out)
         return f, {"x": x_shape}, []
 
-    run_and_verify_func(get_graph((1, 3, 6, 7), [0, 0, 0, 0], [1, 1, 6, 7]))
-    run_and_verify_func(get_graph((1, 3, 6, 7), [0, 1, 0, 0], [1, 2, 6, 6]))
-    run_and_verify_func(get_graph((1, 10), [0, 0], [1, 10], [1, 2]))
+    for slice_mode in ["size", "end"]:
+        run_and_verify_func(
+            get_graph((1, 3, 6, 7), (0, 0, 0, 0), (1, 1, 6, 7), slice_mode=slice_mode)
+        )
+        run_and_verify_func(
+            get_graph((1, 3, 6, 7), [0, 1, 0, 0], [1, 2, 6, 6], slice_mode=slice_mode)
+        )
+        run_and_verify_func(
+            get_graph((2, 3, 6, 7), [0, 0, 0, 0], [-1, -1, -1, -1], slice_mode=slice_mode)
+        )
+        run_and_verify_func(
+            get_graph((2, 3, 6, 7), [0, 1, 0, 0], [-1, -1, -1, -1], slice_mode=slice_mode)
+        )
+        run_and_verify_func(get_graph((1, 6), [0, 1], [1, 3], slice_mode=slice_mode))
 
 
 def test_adaptive_pool2d():
@@ -874,50 +972,67 @@ def test_densenet121():
     run_and_verify_model("densenet121")
 
 
+def test_dynamic_offload():
+    """
+    This test checks for proper dynamic offloading of relay graphs. An addition between
+    the outputs of two conv2d's is performed, one of them having all static args whereas
+    the other has an arg with a dynamic shape. The TRT partitioner is expected to leave
+    the conv2d with the dynamic arg to TVM while offloading the other one to TRT.
+    """
+
+    if skip_codegen_test():
+        return
+
+    data_shape = (1, 32, 8, 8)
+    k_shape = (1, 32, 3, 3)
+
+    x = relay.var("x", shape=(data_shape[0], data_shape[1], Any(), Any()), dtype="float32")
+    y = relay.var("y", shape=(data_shape), dtype="float32")
+    kernel = relay.var("kernel", shape=(k_shape), dtype="float32")
+
+    def get_expected():
+        def set_func_attr(func, compile_name, symbol_name):
+            func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
+            func = func.with_attr("Inline", tvm.tir.IntImm("int32", 1))
+            func = func.with_attr("Compiler", compile_name)
+            func = func.with_attr("global_symbol", symbol_name)
+            return func
+
+        # Create a nested TRT function that matches the expected output
+        mod = tvm.IRModule()
+        var1 = relay.var("tensorrt_0_i0", shape=(data_shape), dtype="float32")
+        kernel_trt = relay.var("tensorrt_0_i1", shape=(k_shape), dtype="float32")
+        out1 = relay.nn.conv2d(var1, kernel_trt, channels=k_shape[0], kernel_size=k_shape[2:4])
+        f1 = GlobalVar("tensorrt_0")
+        func = relay.Function([var1, kernel_trt], out1)
+        func = set_func_attr(func, "tensorrt", "tensorrt_0")
+        mod[f1] = func
+        mod = relay.transform.InferType()(mod)
+
+        # Create the main function
+        out1 = relay.nn.conv2d(x, kernel, channels=k_shape[0], kernel_size=k_shape[2:4])
+        out = relay.add(out1, f1(y, kernel))
+        f = relay.Function([x, y, kernel], out)
+        mod["main"] = f
+        mod = relay.transform.InferType()(mod)
+        return mod
+
+    # Create relay function that will be offloaded to TRT
+    out1 = relay.nn.conv2d(x, kernel, channels=k_shape[0], kernel_size=k_shape[2:4])
+    out2 = relay.nn.conv2d(y, kernel, channels=k_shape[0], kernel_size=k_shape[2:4])
+    out = relay.add(out1, out2)
+    f = relay.Function([x, y, kernel], out)
+
+    # Pass the function to TRT compilation
+    mod = tvm.IRModule()
+    mod["main"] = f
+    mod = relay.transform.InferType()(mod)
+    mod_trt, config = tensorrt.partition_for_tensorrt(mod, params={})
+
+    # Get the expected relay graph and compare
+    mod_exp = get_expected()
+    tvm.ir.assert_structural_equal(mod_trt, mod_exp, map_free_vars=True)
+
+
 if __name__ == "__main__":
-    test_tensorrt_not_compatible()
-    test_tensorrt_simple()
-    test_tensorrt_simple_cpu_io()
-    test_tensorrt_serialize()
-
-    # Op tests
-    test_conv2d()
-    test_conv2d_nhwc()
-    test_conv2d_weights_const()
-    test_conv2d_weights_transposed()
-    test_dense()
-    test_bias_add()
-    test_pool2d()
-    test_global_pool2d()
-    test_batch_flatten()
-    test_expand_dims()
-    test_squeeze()
-    test_concatenate()
-    test_conv2d_transpose()
-    test_reshape()
-    test_transpose()
-    test_float_const()
-    test_pad()
-    test_softmax()
-    test_batch_norm()
-    test_unary()
-    test_clip()
-    test_leaky_relu()
-    test_binary()
-    test_reduce()
-    test_strided_slice()
-    test_adaptive_pool2d()
-    test_multiple_outputs()
-    test_conv3d()
-    test_pool3d()
-    test_conv3d_transpose()
-
-    # Integration tests
-    test_alexnet()
-    test_resnet18_v1()
-    test_resnet18_v2()
-    test_squeezenet()
-    test_mobilenet()
-    test_mobilenet_v2()
-    test_vgg11()
-    test_densenet121()
+    pytest.main([__file__])